feat(transcription): add Whisper transcriber and audio utilities
- Add WhisperTranscriber wrapper for stable-ts/faster-whisper - Add audio utilities for ffmpeg/ffprobe operations - Add translator for two-stage translation workflow - Support CPU/GPU with graceful degradation
This commit is contained in:
354
backend/transcription/audio_utils.py
Normal file
354
backend/transcription/audio_utils.py
Normal file
@@ -0,0 +1,354 @@
|
||||
"""Audio processing utilities extracted from transcriptarr.py."""
|
||||
import logging
|
||||
import os
|
||||
from io import BytesIO
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
import ffmpeg
|
||||
|
||||
# Optional import - graceful degradation if not available
|
||||
try:
|
||||
import av
|
||||
AV_AVAILABLE = True
|
||||
except ImportError:
|
||||
av = None
|
||||
AV_AVAILABLE = False
|
||||
logging.warning("av (PyAV) not available. Some audio features may not work.")
|
||||
|
||||
from backend.core.language_code import LanguageCode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_audio_segment(
    input_file: str,
    start_time: int,
    duration: int,
) -> BytesIO:
    """
    Extract a segment of audio from a file to memory.

    Runs ffmpeg on the input with a seek offset and duration, and captures
    the result from stdout as a 16 kHz, 16-bit PCM WAV stream.

    Args:
        input_file: Path to input media file
        start_time: Start time in seconds
        duration: Duration in seconds

    Returns:
        BytesIO object containing audio segment

    Raises:
        ffmpeg.Error: If the ffmpeg process fails (stderr is logged first)
        ValueError: If ffmpeg succeeded but produced no output
    """
    try:
        logger.debug(f"Extracting audio: {input_file}, start={start_time}s, duration={duration}s")

        out, _ = (
            ffmpeg.input(input_file, ss=start_time, t=duration)
            .output("pipe:1", format="wav", acodec="pcm_s16le", ar=16000)
            .run(capture_stdout=True, capture_stderr=True)
        )

        if not out:
            raise ValueError("FFmpeg output is empty")

        return BytesIO(out)

    except ffmpeg.Error as e:
        # stderr is raw process output: it can be missing or contain bytes
        # that are not valid UTF-8 (e.g. odd file names). Decode leniently so
        # that logging never raises and hides the original ffmpeg failure.
        stderr = e.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode(errors="replace")
        logger.error(f"FFmpeg error: {stderr or ''}")
        raise
    except Exception as e:
        logger.error(f"Error extracting audio: {e}")
        raise
|
||||
|
||||
|
||||
def get_audio_tracks(video_file: str) -> List[Dict]:
    """
    Get information about audio tracks in a media file.

    Probes the file with ffprobe (audio streams only) and builds one dict
    per stream.

    Args:
        video_file: Path to media file

    Returns:
        List of dicts with audio track information. Each dict has the keys
        "index", "codec", "channels", "language", "title", "default",
        "forced", "original" and "commentary". An empty list is returned if
        probing fails for any reason (the error is logged).
    """
    try:
        probe = ffmpeg.probe(video_file, select_streams="a")
        audio_streams = probe.get("streams", [])

        audio_tracks = []
        for stream in audio_streams:
            tags = stream.get("tags", {})
            disposition = stream.get("disposition", {})
            title = tags.get("title", "")

            # Try different common tag names (containers are not consistent
            # about where/how the language is stored)
            lang_tag = (
                tags.get("language")  # Standard location
                or tags.get("LANGUAGE")  # Uppercase variant
                or tags.get("lang")  # Short form
                or stream.get("language")  # Sometimes at stream level
                or "und"  # Default: undefined
            )

            # Log ALL tags for debugging
            logger.debug(
                f"Audio track {stream.get('index')}: "
                f"codec={stream.get('codec_name')}, "
                f"lang_tag='{lang_tag}', "
                f"all_tags={tags}"
            )

            language = LanguageCode.from_iso_639_2(lang_tag)

            # Log when language is undefined
            if lang_tag == "und" or language is None:
                logger.warning(
                    f"Audio track {stream.get('index')} in {video_file}: "
                    f"Language undefined (tag='{lang_tag}'). "
                    f"Available tags: {list(tags.keys())}"
                )

            audio_tracks.append(
                {
                    "index": int(stream.get("index", 0)),
                    "codec": stream.get("codec_name", "unknown"),
                    "channels": int(stream.get("channels", 0)),
                    "language": language,
                    "title": title,
                    "default": disposition.get("default", 0) == 1,
                    "forced": disposition.get("forced", 0) == 1,
                    "original": disposition.get("original", 0) == 1,
                    # Heuristic: a track counts as commentary when its title says so
                    "commentary": "commentary" in title.lower(),
                }
            )

        return audio_tracks

    except ffmpeg.Error as e:
        # Decode stderr so the log shows readable text instead of a bytes
        # repr; decode leniently so logging itself cannot fail.
        stderr = e.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode(errors="replace")
        logger.error(f"FFmpeg error: {stderr}")
        return []
    except Exception as e:
        logger.error(f"Error reading audio tracks: {e}")
        return []
|
||||
|
||||
|
||||
def extract_audio_track_to_memory(
    input_video_path: str, track_index: int
) -> Optional[BytesIO]:
    """
    Extract a specific audio track to memory.

    The selected stream is written to stdout by ffmpeg as mono, 16 kHz WAV.

    Args:
        input_video_path: Path to video file
        track_index: Audio track index, used in ffmpeg's "0:<index>" stream
            map. None skips extraction.

    Returns:
        BytesIO with audio data, or None if track_index is None, ffmpeg
        fails, or ffmpeg produces no output
    """
    if track_index is None:
        logger.warning(f"Skipping audio track extraction for {input_video_path}")
        return None

    try:
        out, _ = (
            ffmpeg.input(input_video_path)
            .output(
                "pipe:",
                map=f"0:{track_index}",
                format="wav",
                ac=1,
                ar=16000,
                loglevel="quiet",
            )
            .run(capture_stdout=True, capture_stderr=True)
        )
    except ffmpeg.Error as e:
        # With loglevel="quiet" stderr is usually empty; decode leniently so
        # the handler can never raise while reporting the failure.
        stderr = e.stderr
        if isinstance(stderr, bytes):
            stderr = stderr.decode(errors="replace")
        logger.error(f"FFmpeg error extracting track: {stderr or ''}")
        return None

    if not out:
        # An empty buffer is not usable audio; report it as a failure rather
        # than handing callers a BytesIO with nothing in it.
        logger.error(
            f"FFmpeg produced no output for track {track_index} of {input_video_path}"
        )
        return None

    return BytesIO(out)
|
||||
|
||||
|
||||
def get_audio_languages(video_path: str) -> List[LanguageCode]:
    """
    Collect the language of every audio stream in a video.

    Args:
        video_path: Path to video file

    Returns:
        The "language" entry of each track reported by get_audio_tracks,
        in the same order
    """
    return [info["language"] for info in get_audio_tracks(video_path)]
|
||||
|
||||
|
||||
def get_subtitle_languages(video_path: str) -> List[LanguageCode]:
    """
    Extract language codes from subtitle streams.

    Args:
        video_path: Path to video file

    Returns:
        List of LanguageCode objects, one per subtitle stream.
        Streams without a "language" metadata entry are reported as
        LanguageCode.NONE. If PyAV is unavailable the list is empty; if the
        file cannot be read, whatever was collected before the error is
        returned (the error is logged).
    """
    languages = []

    # Same guard as the other PyAV-based helpers in this module: without it,
    # a missing PyAV shows up as an AttributeError on `av.open`.
    if not AV_AVAILABLE or av is None:
        logger.warning(
            f"av (PyAV) not available, cannot read subtitle languages for {video_path}"
        )
        return languages

    try:
        with av.open(video_path) as container:
            for stream in container.streams.subtitles:
                lang_code = stream.metadata.get("language")
                if lang_code:
                    languages.append(LanguageCode.from_iso_639_2(lang_code))
                else:
                    languages.append(LanguageCode.NONE)
    except Exception as e:
        logger.error(f"Error reading subtitle languages: {e}")

    return languages
|
||||
|
||||
|
||||
def has_audio(file_path: str) -> bool:
    """
    Report whether a media file contains at least one usable audio stream.

    Args:
        file_path: Path to media file

    Returns:
        True if an audio stream with a codec context whose name is not
        "none" is found. Also True when PyAV is unavailable, since the file
        cannot be inspected. False if the path is not a file, no such stream
        exists, or inspection raises.
    """
    pyav_missing = av is None or not AV_AVAILABLE
    if pyav_missing:
        logger.warning(f"av (PyAV) not available, cannot check audio for {file_path}")
        # No way to probe the file, so optimistically report that audio exists
        return True

    try:
        if not os.path.isfile(file_path):
            return False

        with av.open(file_path) as container:
            return any(
                stream.type == "audio"
                and stream.codec_context
                and stream.codec_context.name != "none"
                for stream in container.streams
            )

    except Exception as e:
        # Any failure while opening/inspecting the file means "no audio"
        logger.debug(f"Error checking audio in {file_path}: {e}")
        return False
|
||||
|
||||
|
||||
def has_subtitle_language_in_file(
    video_file: str, target_language: LanguageCode
) -> bool:
    """
    Tell whether a video carries an embedded subtitle stream in a language.

    Args:
        video_file: Path to video file
        target_language: Language to check for

    Returns:
        True if a subtitle stream's "language" metadata resolves (via
        LanguageCode.from_string) to target_language. False otherwise,
        including when PyAV is unavailable or the file cannot be read.
    """
    if av is None or not AV_AVAILABLE:
        logger.warning(f"av (PyAV) not available, cannot check subtitles for {video_file}")
        return False

    try:
        with av.open(video_file) as container:
            # Only subtitle streams that actually carry a language tag
            tagged_subtitles = [
                candidate
                for candidate in container.streams
                if candidate.type == "subtitle" and "language" in candidate.metadata
            ]

            for candidate in tagged_subtitles:
                raw_code = candidate.metadata.get("language", "").lower()
                if LanguageCode.from_string(raw_code) == target_language:
                    logger.debug(f"Found subtitles in '{target_language}' in video")
                    return True

            return False

    except Exception as e:
        logger.error(f"Error checking subtitles: {e}")
        return False
|
||||
|
||||
|
||||
def has_subtitle_of_language_in_folder(
    video_file: str, target_language: LanguageCode
) -> bool:
    """
    Check if external subtitle file exists for video.

    Looks in the video's folder for files with a subtitle extension whose
    name starts with the video's base name, then checks every dot-separated
    token after that base name against the target language.

    Args:
        video_file: Path to video file
        target_language: Language to check for

    Returns:
        True if external subtitle exists, False otherwise (including when
        the folder cannot be listed; the error is logged)
    """
    subtitle_extensions = (".srt", ".vtt", ".sub", ".ass", ".ssa")

    # dirname() is "" for a bare filename, and os.listdir("") raises, so fall
    # back to the current directory in that case.
    video_folder = os.path.dirname(video_file) or "."
    video_name = os.path.splitext(os.path.basename(video_file))[0]

    try:
        for file_name in os.listdir(video_folder):
            # Compare extensions case-insensitively so "Movie.en.SRT" counts
            if not file_name.lower().endswith(subtitle_extensions):
                continue

            subtitle_name, _ = os.path.splitext(file_name)
            if not subtitle_name.startswith(video_name):
                continue

            # Whatever follows the video name, e.g. ".en.forced" -> ["en", "forced"]
            parts = subtitle_name[len(video_name):].lstrip(".").split(".")

            if any(LanguageCode.from_string(part) == target_language for part in parts):
                logger.debug(f"Found external subtitle: {file_name}")
                return True

        return False

    except Exception as e:
        logger.error(f"Error checking external subtitles: {e}")
        return False
|
||||
|
||||
|
||||
def handle_multiple_audio_tracks(
    file_path: str, language: Optional[LanguageCode] = None
) -> Optional[BytesIO]:
    """
    Pick one audio track from a multi-track file and extract it to memory.

    Args:
        file_path: Path to media file
        language: Preferred language; the first track with this language is
            chosen, otherwise the first track in the file

    Returns:
        BytesIO with the extracted audio, or None when the file has at most
        one audio track or the extraction itself returns None
    """
    tracks = get_audio_tracks(file_path)

    # Files with zero or one audio track need no selection
    if len(tracks) <= 1:
        return None

    logger.debug(f"Handling {len(tracks)} audio tracks")

    # Prefer the first track matching the requested language, if one was given
    chosen = None
    if language:
        chosen = next(
            (candidate for candidate in tracks if candidate["language"] == language),
            None,
        )

    # Otherwise fall back to the first track
    chosen = chosen or tracks[0]

    return extract_audio_track_to_memory(file_path, chosen["index"])
|
||||
Reference in New Issue
Block a user