feat(scanning): add library scanner with rules engine
- Add ScanRule model with configurable conditions
- Add FileAnalyzer for ffprobe-based media analysis
- Add LibraryScanner with manual, scheduled, and watcher modes
- Add LanguageDetector for audio language detection
- Support rule-based filtering with priority evaluation
backend/scanning/library_scanner.py (new file, 894 lines)
@@ -0,0 +1,894 @@
"""Library scanner with rule-based filtering and scheduling."""
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from typing import List, Optional, Dict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from apscheduler.schedulers.background import BackgroundScheduler
|
||||
from watchdog.observers import Observer
|
||||
from watchdog.events import FileSystemEventHandler, FileCreatedEvent
|
||||
|
||||
from backend.core.database import database
|
||||
from backend.core.queue_manager import queue_manager
|
||||
from backend.core.models import QualityPreset
|
||||
from backend.scanning.models import ScanRule
|
||||
from backend.scanning.file_analyzer import FileAnalyzer, FileAnalysis
|
||||
from backend.scanning.language_detector import language_detector
|
||||
from backend.core.language_code import LanguageCode
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LibraryFileHandler(FileSystemEventHandler):
    """Watchdog handler for real-time file detection."""

    def __init__(self, scanner: "LibraryScanner"):
        """
        Initialize file handler.

        Args:
            scanner: Parent LibraryScanner instance
        """
        super().__init__()
        self.scanner = scanner

    def on_created(self, event: FileCreatedEvent):
        """
        Handle new file creation.

        Args:
            event: File creation event
        """
        if event.is_directory:
            return

        file_path = event.src_path

        # Check if it's a video file
        if not FileAnalyzer.is_video_file(file_path):
            return

        # Wait a bit for the file to be fully written. NOTE: this sleep runs
        # on watchdog's event-dispatch thread, so it delays handling of any
        # subsequent events; slow copies may need a sturdier stability check.
        time.sleep(5)

        logger.info(f"New file detected: {file_path}")
        self.scanner.process_file(file_path)


class LibraryScanner:
    """
    Library scanner with rule-based filtering.

    Scans media libraries, analyzes files with ffprobe, and applies
    configurable rules to determine which files need transcription.

    Supports:
    - One-time manual scans
    - Scheduled periodic scans (cron-like)
    - Real-time file watching (Tdarr-style)
    """

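    # Typical lifecycle (a sketch, using only the methods defined below):
    #
    #   scanner = LibraryScanner()
    #   scanner.scan_libraries(["/media/movies"])      # one-time scan
    #   scanner.start_scheduler(interval_minutes=360)  # periodic scans
    #   scanner.start_file_watcher()                   # real-time watching
    #   ...
    #   scanner.stop_scheduler()
    #   scanner.stop_file_watcher()
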
    def __init__(self):
        """Initialize library scanner."""
        self.scheduler: Optional[BackgroundScheduler] = None
        self.file_observer: Optional[Observer] = None
        self.is_scanning = False
        self.last_scan_time: Optional[datetime] = None
        self.files_scanned = 0
        self.files_queued = 0

        logger.info("LibraryScanner initialized")

    def scan_libraries(self, paths: Optional[List[str]] = None) -> Dict:
        """
        Perform a one-time scan of library directories.

        Args:
            paths: List of directories to scan (uses config if None)

        Returns:
            Dictionary with scan statistics
        """
        if self.is_scanning:
            logger.warning("Scan already in progress")
            return {"error": "Scan already in progress"}

        self.is_scanning = True
        self.files_scanned = 0
        self.files_queued = 0
        scan_start = time.time()

        try:
            # Get paths from settings_service if not provided
            if paths is None:
                from backend.core.settings_service import settings_service
                library_paths = settings_service.get('library_paths', '')
                if not library_paths:
                    logger.error("No library paths configured")
                    return {"error": "No library paths configured"}
                # Handle both comma and pipe separators
                if '|' in library_paths:
                    paths = [p.strip() for p in library_paths.split("|") if p.strip()]
                else:
                    paths = [p.strip() for p in library_paths.split(",") if p.strip()]

            logger.info(f"Starting library scan: {len(paths)} paths")

            # Load all enabled rules
            rules = self._load_scan_rules()
            logger.info(f"Loaded {len(rules)} enabled scan rules")

            # Scan each path
            for path in paths:
                if not os.path.isdir(path):
                    logger.warning(f"Path not found or not a directory: {path}")
                    continue

                logger.info(f"Scanning: {path}")
                self._scan_directory(path, rules)

            scan_duration = time.time() - scan_start
            self.last_scan_time = datetime.now(timezone.utc)
            self._persist_scan_stats(files_in_this_scan=self.files_scanned)

            results = {
                "status": "completed",
                "files_scanned": self.files_scanned,
                "files_queued": self.files_queued,
                "duration_seconds": round(scan_duration, 2),
                "timestamp": self.last_scan_time.isoformat(),
            }

            logger.info(
                f"Scan completed: {self.files_scanned} files scanned, "
                f"{self.files_queued} jobs queued in {scan_duration:.1f}s"
            )

            return results

        except Exception as e:
            logger.error(f"Scan failed: {e}", exc_info=True)
            return {"error": str(e)}

        finally:
            self.is_scanning = False

    def _scan_directory(self, directory: str, rules: List[ScanRule]):
        """
        Recursively scan a directory.

        Args:
            directory: Directory path
            rules: List of scan rules to apply
        """
        try:
            for root, dirs, files in os.walk(directory):
                for file in files:
                    file_path = os.path.join(root, file)

                    # Skip non-video files up front (mirrors scan_paths) so
                    # every stray file doesn't go through ffprobe
                    if not FileAnalyzer.is_video_file(file_path):
                        continue

                    self.files_scanned += 1

                    # Process file
                    self.process_file(file_path, rules)

        except Exception as e:
            logger.error(f"Error scanning directory {directory}: {e}")

    def process_file(
        self, file_path: str, rules: Optional[List[ScanRule]] = None
    ) -> bool:
        """
        Process a single file against scan rules.

        Args:
            file_path: Path to media file
            rules: Optional list of rules (will load if None)

        Returns:
            True if a job was queued, False otherwise
        """
        try:
            # Analyze file
            analysis = FileAnalyzer.analyze_file(file_path)
            if not analysis:
                return False

            # Check if we need language detection
            if not analysis.default_audio_language or len(analysis.audio_languages) == 0:
                logger.info(
                    f"Audio language unknown for {analysis.file_name}, "
                    f"queuing language detection job"
                )
                return self._queue_language_detection_job(analysis)

            # Load rules if not provided
            if rules is None:
                rules = self._load_scan_rules()

            # Evaluate against rules
            matching_rule = self._evaluate_rules(analysis, rules)

            if matching_rule:
                # Queue job based on rule
                return self._queue_job_from_rule(analysis, matching_rule)

            return False

        except Exception as e:
            logger.error(f"Error processing file {file_path}: {e}")
            return False

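    # Decision flow per file (a summary of process_file above):
    #
    #   analyze_file() -> audio language unknown -> queue LANGUAGE_DETECTION job
    #                  -> language known         -> first matching rule ->
    #                                               queue transcription/translation job
    #                  -> no rule matches        -> skip file
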
    def _evaluate_rules(
        self, file_analysis: FileAnalysis, rules: List[ScanRule]
    ) -> Optional[ScanRule]:
        """
        Evaluate file against rules (in priority order).

        Args:
            file_analysis: File analysis result
            rules: List of scan rules

        Returns:
            First matching rule or None
        """
        for rule in rules:
            if self._rule_matches(file_analysis, rule):
                logger.debug(f"File {file_analysis.file_name} matches rule: {rule.name}")
                return rule

        return None

    def _rule_matches(self, file_analysis: FileAnalysis, rule: ScanRule) -> bool:
        """
        Check if a file matches a scan rule.

        Args:
            file_analysis: File analysis
            rule: Scan rule

        Returns:
            True if all conditions match
        """
        # Check if rule has any conditions defined
        has_conditions = any([
            rule.file_extension,
            rule.audio_language_is,
            rule.audio_language_not,
            rule.audio_track_count_min,
            rule.has_embedded_subtitle_lang,
            rule.missing_embedded_subtitle_lang,
            rule.missing_external_subtitle_lang,
        ])

        if not has_conditions:
            logger.warning(
                f"Rule '{rule.name}' has no conditions - it will match ALL files. "
                f"This is probably not what you want!"
            )

        # Check file extension filter
        if rule.file_extension:
            if file_analysis.file_extension not in rule.file_extension_list:
                return False

        # Check audio language IS
        if rule.audio_language_is:
            target_lang = LanguageCode.from_string(rule.audio_language_is)

            # Check if file has the target language
            has_target_lang = target_lang in file_analysis.audio_languages

            # Check if any track has an undefined language (None) and
            # therefore needs detection
            has_undefined_lang = any(
                lang is None for lang in file_analysis.audio_languages
            )

            if not has_target_lang:
                # If language is undefined, try to detect it with Whisper
                if has_undefined_lang:
                    logger.info(
                        f"File {file_analysis.file_name} has undefined audio language - "
                        f"attempting detection with Whisper..."
                    )

                    detected_lang = language_detector.detect_language(file_analysis.file_path)

                    if detected_lang:
                        logger.info(
                            f"Detected language for {file_analysis.file_name}: {detected_lang}"
                        )

                        # Check if detected language matches rule
                        if detected_lang == target_lang:
                            logger.info(
                                f"✓ Detected language '{detected_lang}' matches rule '{rule.name}'"
                            )
                            # Update file_analysis with the detected language and
                            # fall through so the remaining conditions below are
                            # still checked (an early `return True` here would
                            # skip them)
                            if file_analysis.audio_tracks:
                                file_analysis.audio_tracks[0].language = detected_lang
                        else:
                            logger.debug(
                                f"Rule '{rule.name}' failed: detected '{detected_lang}' "
                                f"but expected '{rule.audio_language_is}'"
                            )
                            return False
                    else:
                        logger.warning(
                            f"Failed to detect language for {file_analysis.file_name} - skipping"
                        )
                        return False
                else:
                    # Language is defined but doesn't match
                    logger.debug(
                        f"Rule '{rule.name}' audio check failed for {file_analysis.file_name}: "
                        f"Expected '{rule.audio_language_is}' but found "
                        f"{[str(lang) if lang else 'und' for lang in file_analysis.audio_languages]}"
                    )
                    return False

        # Check audio language NOT
        if rule.audio_language_not:
            excluded_langs = [
                LanguageCode.from_string(lang) for lang in rule.audio_language_not_list
            ]
            if any(lang in file_analysis.audio_languages for lang in excluded_langs):
                return False

        # Check minimum audio tracks
        if rule.audio_track_count_min:
            if len(file_analysis.audio_tracks) < rule.audio_track_count_min:
                return False

        # Check HAS embedded subtitle
        if rule.has_embedded_subtitle_lang:
            required_lang = LanguageCode.from_string(rule.has_embedded_subtitle_lang)
            if not file_analysis.has_embedded_subtitle_language(required_lang):
                return False

        # Check MISSING embedded subtitle
        if rule.missing_embedded_subtitle_lang:
            excluded_lang = LanguageCode.from_string(rule.missing_embedded_subtitle_lang)
            if file_analysis.has_embedded_subtitle_language(excluded_lang):
                return False

        # Check MISSING external subtitle
        if rule.missing_external_subtitle_lang:
            excluded_lang = LanguageCode.from_string(rule.missing_external_subtitle_lang)
            if file_analysis.has_external_subtitle_language(excluded_lang):
                return False

        # All conditions matched
        logger.debug(
            f"File '{file_analysis.file_name}' matched rule '{rule.name}' "
            f"(priority: {rule.priority})"
        )
        return True

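    # Example of a rule _rule_matches would accept (a sketch; the field names
    # are the ones read above, but the actual ScanRule model is defined in
    # backend/scanning/models.py):
    #
    #   ScanRule(
    #       name="Japanese audio without English subs",
    #       enabled=True,
    #       priority=10,
    #       audio_language_is="ja",
    #       missing_external_subtitle_lang="en",
    #       action_type="translate",
    #       target_language="en",
    #       quality_preset="balanced",
    #       job_priority=5,
    #   )
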
    def _queue_language_detection_job(self, file_analysis: FileAnalysis) -> bool:
        """
        Create and queue a language detection job for a file with unknown audio language.

        Args:
            file_analysis: File analysis

        Returns:
            True if job was queued successfully
        """
        try:
            from backend.core.models import JobType, JobStatus

            # Check if there's already a completed detection job for this file
            with database.get_session() as session:
                from backend.core.models import Job
                existing_detection = session.query(Job).filter(
                    Job.file_path == file_analysis.file_path,
                    Job.job_type == JobType.LANGUAGE_DETECTION,
                    Job.status == JobStatus.COMPLETED
                ).first()

                if existing_detection:
                    logger.info(
                        f"✓ Language already detected for {file_analysis.file_name}, "
                        f"checking for transcription rules..."
                    )
                    # Extract detected language from SRT content
                    if existing_detection.srt_content:
                        # Format: "Language detected: ja (Japanese)\nConfidence: 99%"
                        lines = existing_detection.srt_content.split('\n')
                        if lines:
                            lang_line = lines[0]
                            if 'Language detected:' in lang_line:
                                lang_code = lang_line.split(':')[1].strip().split(' ')[0]
                                # Trigger rule checking with detected language
                                self._check_and_queue_transcription_for_file(
                                    file_analysis.file_path, lang_code
                                )
                    return False

            # Add language detection job with high priority
            job = queue_manager.add_job(
                file_path=file_analysis.file_path,
                file_name=file_analysis.file_name,
                source_lang=None,  # To be detected
                target_lang=None,
                quality_preset=QualityPreset.FAST,
                priority=15,  # Higher than normal transcription (0-10) but lower than manual (20+)
                transcribe_or_translate="transcribe",
                job_type=JobType.LANGUAGE_DETECTION,
            )

            if job:
                logger.info(
                    f"✓ Queued LANGUAGE DETECTION job {job.id} for {file_analysis.file_name}"
                )
                self.files_queued += 1
                return True
            else:
                logger.warning(
                    f"✗ Skipped detection for {file_analysis.file_name}: Job already exists"
                )
                return False

        except Exception as e:
            logger.error(f"Error queuing language detection job: {e}")
            return False

    def _check_and_queue_transcription_for_file(self, file_path: str, detected_lang_code: str):
        """
        Check if a file with detected language matches any scan rules and queue transcription.

        Args:
            file_path: Path to the file
            detected_lang_code: Detected language code (ISO 639-1, e.g., 'ja', 'en')
        """
        try:
            logger.info(
                f"Checking if {file_path} with language '{detected_lang_code}' "
                f"matches any scan rules..."
            )

            # Load scan rules
            rules = self._load_scan_rules()
            if not rules:
                logger.debug("No active scan rules found")
                return

            # Check each rule
            for rule in rules:
                # Check if language matches
                if rule.audio_language_is:
                    try:
                        rule_lang = LanguageCode.from_string(rule.audio_language_is)
                        # Convert detected language (ISO 639-1) to LanguageCode for comparison
                        detected_lang = LanguageCode.from_iso_639_1(detected_lang_code)

                        if detected_lang != rule_lang:
                            logger.debug(
                                f"Rule '{rule.name}' requires language {rule_lang}, "
                                f"but detected {detected_lang}"
                            )
                            continue
                    except Exception as e:
                        logger.warning(f"Could not parse rule language code: {e}")
                        continue

                # Check if language should be excluded
                if rule.audio_language_not:
                    excluded_langs = [
                        LanguageCode.from_string(lang.strip())
                        for lang in rule.audio_language_not.split(',')
                    ]
                    detected_lang_obj = LanguageCode.from_iso_639_1(detected_lang_code)
                    if detected_lang_obj in excluded_langs:
                        logger.debug(
                            f"Rule '{rule.name}' excludes language {detected_lang_code}"
                        )
                        continue

                # File matches this rule - queue transcription job
                logger.info(
                    f"File {file_path} matches rule '{rule.name}' - queueing transcription job"
                )

                # Get target language (ISO 639-1 throughout, so the default is
                # "en" rather than the ISO 639-2 code "eng")
                target_lang_code = rule.target_language or "en"

                # Map quality preset
                quality_map = {
                    "fast": QualityPreset.FAST,
                    "balanced": QualityPreset.BALANCED,
                    "best": QualityPreset.BEST,
                }
                quality = quality_map.get(rule.quality_preset, QualityPreset.FAST)

                # Create transcription job
                # All language codes in ISO 639-1 format (ja, en, es)
                job = queue_manager.add_job(
                    file_path=file_path,
                    file_name=os.path.basename(file_path),
                    source_lang=detected_lang_code,  # ISO 639-1 (ja, en, es)
                    target_lang=target_lang_code,  # ISO 639-1 (es, en, fr, etc.)
                    quality_preset=quality,
                    transcribe_or_translate=rule.action_type or "translate",
                    priority=rule.job_priority or 5,
                    is_manual_request=False,
                )

                if job:
                    logger.info(
                        f"✓ Queued transcription job {job.id} for {os.path.basename(file_path)}: "
                        f"{rule.action_type} {detected_lang_code} → {target_lang_code}"
                    )
                    self.files_queued += 1

                # Only queue once (first matching rule)
                return

            logger.debug(f"File {file_path} does not match any scan rules")

        except Exception as e:
            logger.error(
                f"Error checking scan rules for {file_path}: {e}",
                exc_info=True
            )

    def _queue_job_from_rule(
        self, file_analysis: FileAnalysis, rule: ScanRule
    ) -> bool:
        """
        Create and queue a job based on matched rule.

        Args:
            file_analysis: File analysis
            rule: Matched scan rule

        Returns:
            True if job was queued successfully
        """
        try:
            # Map quality preset
            quality_map = {
                "fast": QualityPreset.FAST,
                "balanced": QualityPreset.BALANCED,
                "best": QualityPreset.BEST,
            }
            quality_preset = quality_map.get(rule.quality_preset, QualityPreset.FAST)

            # Determine source language (default audio track)
            source_lang = file_analysis.default_audio_language
            source_lang_code = source_lang.to_iso_639_1() if source_lang else None

            # Add job to queue
            job = queue_manager.add_job(
                file_path=file_analysis.file_path,
                file_name=file_analysis.file_name,
                source_lang=source_lang_code,
                target_lang=rule.target_language,
                quality_preset=quality_preset,
                priority=rule.job_priority,
                transcribe_or_translate=rule.action_type,
            )

            if job:
                logger.info(
                    f"✓ Queued job {job.id} for {file_analysis.file_name}: "
                    f"{rule.action_type} {source_lang_code} → {rule.target_language}"
                )
                self.files_queued += 1
                return True
            else:
                logger.warning(
                    f"✗ Skipped {file_analysis.file_name}: Job already exists or in queue "
                    f"(path: {file_analysis.file_path}, target: {rule.target_language})"
                )
                return False

        except Exception as e:
            logger.error(f"Error queuing job: {e}")
            return False

    def _load_scan_rules(self) -> List[ScanRule]:
        """
        Load enabled scan rules from database.

        Returns:
            List of enabled rules (sorted by priority)
        """
        with database.get_session() as session:
            rules = (
                session.query(ScanRule)
                .filter(ScanRule.enabled == True)
                .order_by(ScanRule.priority.desc(), ScanRule.id)
                .all()
            )
            # Expunge rules from the session so they can be used outside the context
            for rule in rules:
                session.expunge(rule)
            return rules

    def _persist_scan_stats(self, files_in_this_scan: int = 0):
        """
        Persist scan statistics to the database so they survive restarts.

        Args:
            files_in_this_scan: Number of files scanned in the current scan operation
        """
        from backend.core.settings_service import settings_service

        try:
            # Save last scan time
            if self.last_scan_time:
                settings_service.set(
                    'scanner_last_scan_time',
                    self.last_scan_time.isoformat(),
                    category='scanner'
                )

            # Increment scan count
            scan_count = settings_service.get('scanner_scan_count', 0)
            try:
                scan_count = int(scan_count)
            except (ValueError, TypeError):
                scan_count = 0

            scan_count += 1
            settings_service.set(
                'scanner_scan_count',
                str(scan_count),
                category='scanner'
            )

            # Save total files scanned (cumulative)
            if files_in_this_scan > 0:
                current_total = settings_service.get('scanner_total_files_scanned', 0)
                try:
                    current_total = int(current_total)
                except (ValueError, TypeError):
                    current_total = 0

                new_total = current_total + files_in_this_scan
                settings_service.set(
                    'scanner_total_files_scanned',
                    str(new_total),
                    category='scanner'
                )

                logger.debug(
                    f"Persisted scan stats: scan_count={scan_count}, "
                    f"last_scan={self.last_scan_time}, total_files={new_total}"
                )
            else:
                logger.debug(
                    f"Persisted scan stats: scan_count={scan_count}, "
                    f"last_scan={self.last_scan_time}"
                )
        except Exception as e:
            logger.error(f"Failed to persist scan stats: {e}")

    # === Scheduler Methods ===

    def start_scheduler(self, interval_minutes: Optional[int] = None):
        """
        Start scheduled periodic scanning.

        Args:
            interval_minutes: Scan interval (uses config if None)
        """
        if self.scheduler and self.scheduler.running:
            logger.warning("Scheduler already running")
            return

        from backend.core.settings_service import settings_service
        interval = interval_minutes or int(
            settings_service.get('scanner_schedule_interval_minutes', 360)
        )

        self.scheduler = BackgroundScheduler()
        self.scheduler.add_job(
            func=self.scan_libraries,
            trigger="interval",
            minutes=interval,
            id="library_scan",
            name=f"Library scan (every {interval}m)",
        )
        self.scheduler.start()

        logger.info(f"Scheduler started: scanning every {interval} minutes")

    def stop_scheduler(self):
        """Stop scheduled scanning."""
        if self.scheduler and self.scheduler.running:
            try:
                # wait=False to avoid blocking on running jobs
                self.scheduler.shutdown(wait=False)
            except Exception as e:
                logger.warning(f"Error shutting down scheduler: {e}")
            self.scheduler = None
            logger.info("Scheduler stopped")

    # === File Watcher Methods ===

    def start_file_watcher(self, paths: Optional[List[str]] = None, recursive: bool = True):
        """
        Start real-time file watching.

        Args:
            paths: Paths to watch (uses config if None)
            recursive: Whether to watch subdirectories
        """
        if self.file_observer:
            logger.warning("File watcher already running")
            return

        # Get paths from settings_service if not provided
        if paths is None:
            from backend.core.settings_service import settings_service
            library_paths = settings_service.get('library_paths', '')
            if not library_paths:
                logger.error("No library paths configured")
                return
            # Handle both comma and pipe separators
            if '|' in library_paths:
                paths = [p.strip() for p in library_paths.split("|") if p.strip()]
            else:
                paths = [p.strip() for p in library_paths.split(",") if p.strip()]

        self.file_observer = Observer()
        handler = LibraryFileHandler(self)

        for path in paths:
            if os.path.isdir(path):
                self.file_observer.schedule(handler, path, recursive=recursive)
                logger.info(f"Watching: {path} (recursive={recursive})")

        self.file_observer.start()
        logger.info("File watcher started")

    def stop_file_watcher(self):
        """Stop real-time file watching."""
        if self.file_observer:
            try:
                self.file_observer.stop()
                # Use a timeout to avoid blocking indefinitely
                self.file_observer.join(timeout=5.0)
            except Exception as e:
                logger.warning(f"Error stopping file watcher: {e}")
            self.file_observer = None
            logger.info("File watcher stopped")

    def get_status(self) -> Dict:
        """
        Get scanner status.

        Returns:
            Dictionary with scanner status
        """
        from backend.core.settings_service import settings_service

        watched_paths = []
        if self.file_observer:
            # Get watched paths from the observer's emitters (each emitter
            # carries the ObservedWatch it was scheduled with)
            watched_paths = [str(emitter.watch.path) for emitter in self.file_observer.emitters]

        next_scan_time = None
        if self.scheduler and self.scheduler.running:
            # Get next scheduled job time (next_run_time is None for paused jobs)
            jobs = self.scheduler.get_jobs()
            if jobs and jobs[0].next_run_time:
                next_scan_time = jobs[0].next_run_time.isoformat()

        # Get last_scan_time from database (persisted) or memory (current session)
        last_scan_time = self.last_scan_time
        if last_scan_time is None:
            # Try to load from database
            db_last_scan = settings_service.get('scanner_last_scan_time')
            if db_last_scan:
                try:
                    last_scan_time = datetime.fromisoformat(db_last_scan)
                except ValueError:
                    last_scan_time = None

        # Get scan count from database
        scan_count = settings_service.get('scanner_scan_count', 0)
        try:
            scan_count = int(scan_count)
        except (ValueError, TypeError):
            scan_count = 0

        # Get total_files_scanned from database
        total_files_scanned = settings_service.get('scanner_total_files_scanned', 0)
        try:
            total_files_scanned = int(total_files_scanned)
        except (ValueError, TypeError):
            total_files_scanned = 0

        return {
            "scheduler_enabled": self.scheduler is not None,
            "scheduler_running": self.scheduler is not None and self.scheduler.running,
            "next_scan_time": next_scan_time,
            "watcher_enabled": self.file_observer is not None,
            "watcher_running": self.file_observer is not None,
            "watched_paths": watched_paths,
            "last_scan_time": last_scan_time.isoformat() if last_scan_time else None,
            "total_scans": scan_count,
            "total_files_scanned": total_files_scanned,
        }

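    # Shape of the payload returned by get_status (illustrative values only;
    # the keys are exactly those built in the dict above):
    #
    #   {
    #       "scheduler_enabled": True,
    #       "scheduler_running": True,
    #       "next_scan_time": "2025-01-01T06:00:00+00:00",
    #       "watcher_enabled": False,
    #       "watcher_running": False,
    #       "watched_paths": [],
    #       "last_scan_time": "2025-01-01T00:00:00+00:00",
    #       "total_scans": 12,
    #       "total_files_scanned": 3450,
    #   }
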
    def scan_paths(self, paths: List[str], recursive: bool = True) -> Dict:
        """
        Scan specific paths.

        Args:
            paths: List of paths to scan
            recursive: Whether to scan subdirectories

        Returns:
            Scan result dictionary
        """
        if self.is_scanning:
            logger.warning("Scan already in progress")
            return {
                "scanned_files": 0,
                "matched_files": 0,
                "jobs_created": 0,
                "skipped_files": 0,
                "paths_scanned": [],
                "error": "Scan already in progress"
            }

        self.is_scanning = True
        scanned = 0
        matched = 0
        jobs_created = 0
        skipped = 0

        try:
            for path in paths:
                if not os.path.exists(path):
                    logger.warning(f"Path does not exist: {path}")
                    continue

                # Scan directory
                if os.path.isdir(path):
                    for root, dirs, files in os.walk(path):
                        for file in files:
                            file_path = os.path.join(root, file)

                            if not FileAnalyzer.is_video_file(file_path):
                                continue

                            scanned += 1

                            # Process file
                            if self.process_file(file_path):
                                matched += 1
                                jobs_created += 1
                            else:
                                skipped += 1

                        if not recursive:
                            break

                # Single file
                elif os.path.isfile(path):
                    if FileAnalyzer.is_video_file(path):
                        scanned += 1
                        if self.process_file(path):
                            matched += 1
                            jobs_created += 1
                        else:
                            skipped += 1

            self.last_scan_time = datetime.now(timezone.utc)
            self.files_scanned += scanned
            self._persist_scan_stats(files_in_this_scan=scanned)

            return {
                "scanned_files": scanned,
                "matched_files": matched,
                "jobs_created": jobs_created,
                "skipped_files": skipped,
                "paths_scanned": paths,
            }

        finally:
            self.is_scanning = False


# Global scanner instance
library_scanner = LibraryScanner()
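# Example import-site usage (a sketch; `library_scanner` is the module-level
# singleton defined above):
#
#   from backend.scanning.library_scanner import library_scanner
#   results = library_scanner.scan_libraries()
#   print(results.get("files_queued"))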