feat(transcription): add Whisper transcriber and audio utilities

- Add WhisperTranscriber wrapper for stable-ts/faster-whisper
- Add audio utilities for ffmpeg/ffprobe operations
- Add translator for two-stage translation workflow
- Support CPU/GPU with graceful degradation
This commit is contained in:
2026-01-16 16:55:02 +01:00
parent d28c4caa6a
commit cbf5ef9623
4 changed files with 965 additions and 0 deletions

View File

@@ -0,0 +1,354 @@
"""Audio processing utilities extracted from transcriptarr.py."""
import logging
import os
from io import BytesIO
from typing import List, Dict, Optional
import ffmpeg
# Optional import - graceful degradation if not available
try:
import av
AV_AVAILABLE = True
except ImportError:
av = None
AV_AVAILABLE = False
logging.warning("av (PyAV) not available. Some audio features may not work.")
from backend.core.language_code import LanguageCode
logger = logging.getLogger(__name__)
def extract_audio_segment(
input_file: str,
start_time: int,
duration: int,
) -> BytesIO:
"""
Extract a segment of audio from a file to memory.
Args:
input_file: Path to input media file
start_time: Start time in seconds
duration: Duration in seconds
Returns:
BytesIO object containing audio segment
"""
try:
logger.debug(f"Extracting audio: {input_file}, start={start_time}s, duration={duration}s")
out, _ = (
ffmpeg.input(input_file, ss=start_time, t=duration)
.output("pipe:1", format="wav", acodec="pcm_s16le", ar=16000)
.run(capture_stdout=True, capture_stderr=True)
)
if not out:
raise ValueError("FFmpeg output is empty")
return BytesIO(out)
except ffmpeg.Error as e:
logger.error(f"FFmpeg error: {e.stderr.decode()}")
raise
except Exception as e:
logger.error(f"Error extracting audio: {e}")
raise
def get_audio_tracks(video_file: str) -> List[Dict]:
"""
Get information about audio tracks in a media file.
Args:
video_file: Path to media file
Returns:
List of dicts with audio track information
"""
try:
probe = ffmpeg.probe(video_file, select_streams="a")
audio_streams = probe.get("streams", [])
audio_tracks = []
for stream in audio_streams:
# Get all possible language tags - check multiple locations
tags = stream.get("tags", {})
# Try different common tag names (MKV uses different conventions)
lang_tag = (
tags.get("language") or # Standard location
tags.get("LANGUAGE") or # Uppercase variant
tags.get("lang") or # Short form
stream.get("language") or # Sometimes at stream level
"und" # Default: undefined
)
# Log ALL tags for debugging
logger.debug(
f"Audio track {stream.get('index')}: "
f"codec={stream.get('codec_name')}, "
f"lang_tag='{lang_tag}', "
f"all_tags={tags}"
)
language = LanguageCode.from_iso_639_2(lang_tag)
# Log when language is undefined
if lang_tag == "und" or language is None:
logger.warning(
f"Audio track {stream.get('index')} in {video_file}: "
f"Language undefined (tag='{lang_tag}'). "
f"Available tags: {list(tags.keys())}"
)
audio_track = {
"index": int(stream.get("index", 0)),
"codec": stream.get("codec_name", "unknown"),
"channels": int(stream.get("channels", 0)),
"language": language,
"title": tags.get("title", ""),
"default": stream.get("disposition", {}).get("default", 0) == 1,
"forced": stream.get("disposition", {}).get("forced", 0) == 1,
"original": stream.get("disposition", {}).get("original", 0) == 1,
"commentary": "commentary" in tags.get("title", "").lower(),
}
audio_tracks.append(audio_track)
return audio_tracks
except ffmpeg.Error as e:
logger.error(f"FFmpeg error: {e.stderr}")
return []
except Exception as e:
logger.error(f"Error reading audio tracks: {e}")
return []
def extract_audio_track_to_memory(
input_video_path: str, track_index: int
) -> Optional[BytesIO]:
"""
Extract a specific audio track to memory.
Args:
input_video_path: Path to video file
track_index: Audio track index
Returns:
BytesIO with audio data or None
"""
if track_index is None:
logger.warning(f"Skipping audio track extraction for {input_video_path}")
return None
try:
out, _ = (
ffmpeg.input(input_video_path)
.output(
"pipe:",
map=f"0:{track_index}",
format="wav",
ac=1,
ar=16000,
loglevel="quiet",
)
.run(capture_stdout=True, capture_stderr=True)
)
return BytesIO(out)
except ffmpeg.Error as e:
logger.error(f"FFmpeg error extracting track: {e.stderr.decode()}")
return None
def get_audio_languages(video_path: str) -> List[LanguageCode]:
"""
Extract language codes from audio streams.
Args:
video_path: Path to video file
Returns:
List of LanguageCode objects
"""
audio_tracks = get_audio_tracks(video_path)
return [track["language"] for track in audio_tracks]
def get_subtitle_languages(video_path: str) -> List[LanguageCode]:
"""
Extract language codes from subtitle streams.
Args:
video_path: Path to video file
Returns:
List of LanguageCode objects
"""
languages = []
try:
with av.open(video_path) as container:
for stream in container.streams.subtitles:
lang_code = stream.metadata.get("language")
if lang_code:
languages.append(LanguageCode.from_iso_639_2(lang_code))
else:
languages.append(LanguageCode.NONE)
except Exception as e:
logger.error(f"Error reading subtitle languages: {e}")
return languages
def has_audio(file_path: str) -> bool:
"""
Check if a file has valid audio streams.
Args:
file_path: Path to media file
Returns:
True if file has audio, False otherwise
"""
if not AV_AVAILABLE or av is None:
logger.warning(f"av (PyAV) not available, cannot check audio for {file_path}")
# Assume file has audio if we can't check
return True
try:
if not os.path.isfile(file_path):
return False
with av.open(file_path) as container:
for stream in container.streams:
if stream.type == "audio":
if stream.codec_context and stream.codec_context.name != "none":
return True
return False
except Exception as e:
# Catch all exceptions since av.FFmpegError might not exist if av is None
logger.debug(f"Error checking audio in {file_path}: {e}")
return False
def has_subtitle_language_in_file(
video_file: str, target_language: LanguageCode
) -> bool:
"""
Check if video has embedded subtitles in target language.
Args:
video_file: Path to video file
target_language: Language to check for
Returns:
True if subtitles exist in target language
"""
if not AV_AVAILABLE or av is None:
logger.warning(f"av (PyAV) not available, cannot check subtitles for {video_file}")
return False
try:
with av.open(video_file) as container:
subtitle_streams = [
stream
for stream in container.streams
if stream.type == "subtitle" and "language" in stream.metadata
]
for stream in subtitle_streams:
stream_language = LanguageCode.from_string(
stream.metadata.get("language", "").lower()
)
if stream_language == target_language:
logger.debug(f"Found subtitles in '{target_language}' in video")
return True
return False
except Exception as e:
logger.error(f"Error checking subtitles: {e}")
return False
def has_subtitle_of_language_in_folder(
video_file: str, target_language: LanguageCode
) -> bool:
"""
Check if external subtitle file exists for video.
Args:
video_file: Path to video file
target_language: Language to check for
Returns:
True if external subtitle exists
"""
subtitle_extensions = {".srt", ".vtt", ".sub", ".ass", ".ssa"}
video_folder = os.path.dirname(video_file)
video_name = os.path.splitext(os.path.basename(video_file))[0]
try:
for file_name in os.listdir(video_folder):
if not any(file_name.endswith(ext) for ext in subtitle_extensions):
continue
subtitle_name, _ = os.path.splitext(file_name)
if not subtitle_name.startswith(video_name):
continue
# Extract language from filename
parts = subtitle_name[len(video_name) :].lstrip(".").split(".")
for part in parts:
if LanguageCode.from_string(part) == target_language:
logger.debug(f"Found external subtitle: {file_name}")
return True
return False
except Exception as e:
logger.error(f"Error checking external subtitles: {e}")
return False
def handle_multiple_audio_tracks(
file_path: str, language: Optional[LanguageCode] = None
) -> Optional[BytesIO]:
"""
Handle files with multiple audio tracks.
Args:
file_path: Path to media file
language: Preferred language
Returns:
BytesIO with extracted audio or None
"""
audio_tracks = get_audio_tracks(file_path)
if len(audio_tracks) <= 1:
return None
logger.debug(f"Handling {len(audio_tracks)} audio tracks")
# Find track by language
audio_track = None
if language:
for track in audio_tracks:
if track["language"] == language:
audio_track = track
break
# Fallback to first track
if not audio_track:
audio_track = audio_tracks[0]
return extract_audio_track_to_memory(file_path, audio_track["index"])