Compare commits


2 Commits

Author SHA1 Message Date
8acbe84b15 chore: update project structure and workflows
- Update .gitignore for new backend structure
- Update GitHub workflows for transcriptarr rename
- Update launcher.py to use new module name
2026-01-11 21:37:23 +01:00
4e9e3c4159 docs: add comprehensive documentation and test suite
- Add CLAUDE.md with project architecture and operation modes
- Add backend/README.md with setup and usage instructions
- Add test_backend.py with automated tests for config, database, and queue
- Update requirements.txt with optional dependencies structure
- Update .env.example with all configuration options
2026-01-11 21:37:20 +01:00
9 changed files with 481 additions and 26 deletions

.env.example
View File

@@ -1,6 +1,90 @@
# ============================================
# TranscriptorIO Configuration
# ============================================
# === Application Mode ===
# Options: standalone, provider, or standalone,provider (hybrid mode)
TRANSCRIPTARR_MODE=standalone
# === Database Configuration ===
# SQLite (default - no additional driver needed)
DATABASE_URL=sqlite:///./transcriptarr.db
# PostgreSQL example (requires psycopg2-binary)
# DATABASE_URL=postgresql://user:password@localhost:5432/transcriptarr
# MariaDB/MySQL example (requires pymysql)
# DATABASE_URL=mariadb+pymysql://user:password@localhost:3306/transcriptarr
# === Worker Configuration ===
CONCURRENT_TRANSCRIPTIONS=2
WHISPER_THREADS=4
TRANSCRIBE_DEVICE=cpu
CLEAR_VRAM_ON_COMPLETE=True
# === Whisper Model Configuration ===
# Options: tiny, base, small, medium, large-v3, large-v3-turbo, etc.
WHISPER_MODEL=medium
MODEL_PATH=./models
COMPUTE_TYPE=auto
# === Standalone Mode Configuration ===
# Pipe-separated paths to scan
LIBRARY_PATHS=/media/anime|/media/movies
AUTO_SCAN_ENABLED=False
SCAN_INTERVAL_MINUTES=30
# Filter rules for standalone mode
REQUIRED_AUDIO_LANGUAGE=ja
REQUIRED_MISSING_SUBTITLE=spa
SKIP_IF_SUBTITLE_EXISTS=True
# === Provider Mode Configuration ===
BAZARR_URL=http://bazarr:6767
BAZARR_API_KEY=your_api_key_here
PROVIDER_TIMEOUT_SECONDS=600
PROVIDER_CALLBACK_ENABLED=True
PROVIDER_POLLING_INTERVAL=30
# === API Configuration ===
WEBHOOK_PORT=9000
API_HOST=0.0.0.0
DEBUG=True
# === Transcription Settings ===
# Options: transcribe, translate
TRANSCRIBE_OR_TRANSLATE=transcribe
SUBTITLE_LANGUAGE_NAME=
# Options: ISO_639_1, ISO_639_2_T, ISO_639_2_B, NAME, NATIVE
SUBTITLE_LANGUAGE_NAMING_TYPE=ISO_639_2_B
WORD_LEVEL_HIGHLIGHT=False
CUSTOM_REGROUP=cm_sl=84_sl=42++++++1
# === Skip Configuration ===
SKIP_IF_EXTERNAL_SUBTITLES_EXIST=False
SKIP_IF_TARGET_SUBTITLES_EXIST=True
SKIP_IF_INTERNAL_SUBTITLES_LANGUAGE=eng
# Pipe-separated language codes
SKIP_SUBTITLE_LANGUAGES=
SKIP_IF_AUDIO_LANGUAGES=
SKIP_UNKNOWN_LANGUAGE=False
SKIP_ONLY_SUBGEN_SUBTITLES=False
# === Advanced Settings ===
FORCE_DETECTED_LANGUAGE_TO=
DETECT_LANGUAGE_LENGTH=30
DETECT_LANGUAGE_OFFSET=0
SHOULD_WHISPER_DETECT_AUDIO_LANGUAGE=False
# Pipe-separated list in order of preference
PREFERRED_AUDIO_LANGUAGES=eng
# === Path Mapping ===
USE_PATH_MAPPING=False
PATH_MAPPING_FROM=/tv
PATH_MAPPING_TO=/Volumes/TV
# === Legacy SubGen Compatibility ===
SHOW_IN_SUBNAME_SUBGEN=True
SHOW_IN_SUBNAME_MODEL=True
APPEND=False
LRC_FOR_AUDIO_FILES=True

View File

@@ -17,7 +17,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
-      - name: Get version from subgen.py
+      - name: Get version from transcriptarr.py
         id: get_version
         run: |
           version=$(grep -oP "subgen_version\s*=\s*'\K[^']+" subgen.py)

View File

@@ -17,7 +17,7 @@ jobs:
         with:
           fetch-depth: 0
-      - name: Get version from subgen.py
+      - name: Get version from transcriptarr.py
         id: get_version
         run: |
           version=$(grep -oP "subgen_version\s*=\s*'\K[^']+" subgen.py)

View File

@@ -5,7 +5,7 @@ on:
     branches:
       - 'main'
     paths:
-      - 'subgen.py'
+      - '../../transcriptarr.py'
   workflow_dispatch: # Allow manual triggering
 jobs:
@@ -26,11 +26,11 @@ jobs:
           echo "COMMIT_COUNT=$COMMIT_COUNT"
           echo "VERSION=${YEAR}.${MONTH}.${COMMIT_COUNT}" >> $GITHUB_ENV
-      - name: Update subgen.py with version
+      - name: Update transcriptarr.py with version
         run: |
           sed -i "s/subgen_version =.*/subgen_version = '${{ env.VERSION }}'/" subgen.py
-      - name: Check if subgen.py was actually changed (compare with HEAD)
+      - name: Check if transcriptarr.py was actually changed (compare with HEAD)
         id: check_change
         run: |
           if git diff --quiet HEAD subgen.py; then
@@ -39,7 +39,7 @@ jobs:
             echo "::set-output name=changed::true"
           fi
-      - name: Amend commit if subgen.py changed
+      - name: Amend commit if transcriptarr.py changed
         if: steps.check_change.outputs.changed == 'true'
         env:
           GIT_AUTHOR_NAME: "McCloudS"

2
.gitignore vendored
View File

@@ -7,6 +7,6 @@
 *.vsix
 #ignore our settings
-subgen.env
+.env
 models/

185
backend/README.md Normal file
View File

@@ -0,0 +1,185 @@
# TranscriptorIO Backend
This is the redesigned backend for TranscriptorIO, a complete fork of SubGen with a modern asynchronous architecture.
## 🎯 Goal
Replace SubGen's synchronous, non-persistent system with a modern Tdarr-inspired architecture:
- ✅ Persistent queue (SQLite/PostgreSQL/MariaDB)
- ✅ Asynchronous processing
- ✅ Job prioritization
- ✅ Complete state visibility
- ✅ No Bazarr timeouts
## 📁 Structure
```
backend/
├── core/
│ ├── database.py # Multi-backend database management
│ ├── models.py # SQLAlchemy models (Job, etc.)
│ ├── queue_manager.py # Asynchronous persistent queue
│ └── __init__.py
├── api/ # (coming soon) FastAPI endpoints
├── config.py # Centralized configuration with Pydantic
└── README.md # This file
```
## 🚀 Setup
### 1. Install dependencies
```bash
pip install -r requirements.txt
```
### 2. Configure .env
Copy `.env.example` to `.env` and adjust as needed:
```bash
cp .env.example .env
```
#### Database Options
**SQLite (default)**:
```env
DATABASE_URL=sqlite:///./transcriptarr.db
```
**PostgreSQL**:
```bash
pip install psycopg2-binary
```
```env
DATABASE_URL=postgresql://user:password@localhost:5432/transcriptarr
```
**MariaDB/MySQL**:
```bash
pip install pymysql
```
```env
DATABASE_URL=mariadb+pymysql://user:password@localhost:3306/transcriptarr
```
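For orientation, here is a minimal sketch of how a `DATABASE_URL` like the ones above can be turned into a SQLAlchemy engine and checked for connectivity. Illustrative only; the actual `backend/core/database.py` may differ.
```python
# Illustrative sketch only -- the real backend/core/database.py may differ.
from sqlalchemy import create_engine, text

DATABASE_URL = "sqlite:///./transcriptarr.db"  # or a PostgreSQL/MariaDB URL from above

# pool_pre_ping validates pooled connections before use (useful for network DBs)
engine = create_engine(DATABASE_URL, pool_pre_ping=True)

# Simple connectivity probe, similar in spirit to database.health_check()
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))
print("database reachable")
```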
### 3. Choose operation mode
**Standalone Mode** (automatically scans your library):
```env
TRANSCRIPTARR_MODE=standalone
LIBRARY_PATHS=/media/anime|/media/movies
AUTO_SCAN_ENABLED=True
SCAN_INTERVAL_MINUTES=30
```
**Provider Mode** (receives jobs from Bazarr):
```env
TRANSCRIPTARR_MODE=provider
BAZARR_URL=http://bazarr:6767
BAZARR_API_KEY=your_api_key
```
**Hybrid Mode** (both simultaneously):
```env
TRANSCRIPTARR_MODE=standalone,provider
```
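Hybrid mode is just a comma-separated list, so parsing it can be as small as the sketch below (function and variable names here are hypothetical, not the project's API):
```python
# Hypothetical helper -- names are illustrative, not TranscriptorIO's actual API.
import os

VALID_MODES = {"standalone", "provider"}

def active_modes() -> set[str]:
    raw = os.getenv("TRANSCRIPTARR_MODE", "standalone")
    modes = {m.strip() for m in raw.split(",") if m.strip()}
    unknown = modes - VALID_MODES
    if unknown:
        raise ValueError(f"Unknown mode(s): {', '.join(sorted(unknown))}")
    return modes

# With TRANSCRIPTARR_MODE=standalone,provider both checks are true:
modes = active_modes()
if "standalone" in modes:
    print("starting library scanner")
if "provider" in modes:
    print("starting Bazarr provider endpoints")
```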
## 🧪 Testing
Run the test script to verify everything works:
```bash
python test_backend.py
```
This will verify:
- ✓ Configuration loading
- ✓ Database connection
- ✓ Table creation
- ✓ Queue operations (add, get, deduplicate)
## 📊 Implemented Components
### config.py
- Centralized configuration with Pydantic
- Automatic environment variable validation
- Multi-backend database support
- Operation mode configuration
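A minimal sketch of that pattern, assuming field names that mirror `.env.example` (the real `config.py` is not shown in this diff and may differ):
```python
# Sketch of the pydantic-settings pattern; field names are assumptions.
from pydantic_settings import BaseSettings, SettingsConfigDict

class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    transcriptarr_mode: str = "standalone"
    database_url: str = "sqlite:///./transcriptarr.db"
    whisper_model: str = "medium"
    concurrent_transcriptions: int = 2  # coerced and validated as int automatically

settings = Settings()  # raises a ValidationError on malformed values
```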
### database.py
- Connection management with SQLAlchemy
- Support for SQLite, PostgreSQL, MariaDB
- Backend-specific optimizations
- SQLite: WAL mode, optimized cache
- PostgreSQL: connection pooling, pre-ping
- MariaDB: utf8mb4 charset, pooling
- Health checks and statistics
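The SQLite-specific tuning can be wired in with a SQLAlchemy connect-event hook; the sketch below shows the general shape (the actual `database.py` may do this differently):
```python
# Illustrative sketch of per-connection SQLite pragmas, not the actual database.py.
from sqlalchemy import create_engine, event

engine = create_engine("sqlite:///./transcriptarr.db")

@event.listens_for(engine, "connect")
def _set_sqlite_pragmas(dbapi_conn, connection_record):
    # Runs for every new connection the pool opens.
    cur = dbapi_conn.cursor()
    cur.execute("PRAGMA journal_mode=WAL")    # readers no longer block the writer
    cur.execute("PRAGMA synchronous=NORMAL")  # safe with WAL, fewer fsyncs
    cur.close()
```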
### models.py
- Complete `Job` model with:
- States: queued, processing, completed, failed, cancelled
- Stages: pending, detecting_language, transcribing, translating, etc.
- Quality presets: fast, balanced, best
- Progress tracking (0-100%)
- Complete timestamps
- Retry logic
- Worker assignment
- Optimized indexes for common queries
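A condensed, hypothetical version of such a model in SQLAlchemy 2.0 style. Column names are assumptions based on this README, not the real `models.py`:
```python
# Hypothetical condensed Job model; the real models.py has more fields.
import enum
from datetime import datetime
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class Base(DeclarativeBase):
    pass

class JobStatus(enum.Enum):
    QUEUED = "queued"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"

class Job(Base):
    __tablename__ = "jobs"

    id: Mapped[int] = mapped_column(primary_key=True)
    file_path: Mapped[str] = mapped_column(index=True)       # indexed for dedup lookups
    status: Mapped[JobStatus] = mapped_column(default=JobStatus.QUEUED, index=True)
    priority: Mapped[int] = mapped_column(default=0)
    progress: Mapped[int] = mapped_column(default=0)          # 0-100%
    retries: Mapped[int] = mapped_column(default=0)
    worker_id: Mapped[str | None] = mapped_column(default=None)  # worker assignment
    created_at: Mapped[datetime] = mapped_column(default=datetime.utcnow)
```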
### queue_manager.py
- Thread-safe persistent queue
- Job prioritization
- Duplicate detection
- Automatic retry for failed jobs
- Real-time statistics
- Automatic cleanup of old jobs
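A sketch of what claiming the next job might look like, reusing the hypothetical `Job`/`JobStatus` from the models.py sketch above (the real `queue_manager.py` is not shown in this diff):
```python
# Sketch only -- reuses the hypothetical Job/JobStatus defined above.
from sqlalchemy import select
from sqlalchemy.orm import Session

def get_next_job(session: Session, worker_id: str) -> "Job | None":
    stmt = (
        select(Job)
        .where(Job.status == JobStatus.QUEUED)
        .order_by(Job.priority.desc(), Job.created_at)  # highest priority, then oldest
        .limit(1)
        .with_for_update(skip_locked=True)  # row lock on PostgreSQL/MariaDB; ignored on SQLite
    )
    job = session.scalars(stmt).first()
    if job is not None:
        job.status = JobStatus.PROCESSING
        job.worker_id = worker_id
        session.commit()  # persist the claim so other workers skip this row
    return job
```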
## 🔄 Comparison with SubGen
| Feature | SubGen | TranscriptorIO |
|---------|--------|----------------|
| Queue | In-memory (lost on restart) | **Persistent in DB** |
| Processing | Synchronous (blocks threads) | **Asynchronous** |
| Prioritization | No | **Yes (configurable)** |
| Visibility | No progress/ETA | **Progress + real-time ETA** |
| Deduplication | Basic (memory only) | **Persistent + intelligent** |
| Retries | No | **Automatic with limit** |
| Database | No | **SQLite/PostgreSQL/MariaDB** |
| Bazarr Timeouts | Yes (>5min = 24h throttle) | **No (async)** |
## 📝 Next Steps
1. **Worker Pool** - Asynchronous worker system
2. **REST API** - FastAPI endpoints for management
3. **WebSocket** - Real-time updates
4. **Transcriber** - Whisper wrapper with progress callbacks
5. **Bazarr Provider** - Improved async provider
6. **Standalone Scanner** - Automatic library scanning
## 🐛 Troubleshooting
### Error: "No module named 'backend'"
Make sure to run scripts from the project root:
```bash
cd /path/to/Transcriptarr
python test_backend.py
```
### Error: Database locked (SQLite)
SQLite is configured with WAL mode for better concurrency. If you still have issues, consider using PostgreSQL for production.
### Error: pydantic.errors.ConfigError
Verify that all required variables are in your `.env`:
```bash
cp .env.example .env
# Edit .env with your values
```
## 📚 Documentation
See `CLAUDE.md` for complete architecture and project roadmap.

launcher.py
View File

@@ -42,7 +42,7 @@ def prompt_and_save_bazarr_env_variables():
     print(instructions)
     env_vars = {
         'WHISPER_MODEL': ('Whisper Model', 'Enter the Whisper model you want to run: tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, distil-large-v2, distil-medium.en, distil-small.en', 'medium'),
-        'WEBHOOKPORT': ('Webhook Port', 'Default listening port for subgen.py', '9000'),
+        'WEBHOOKPORT': ('Webhook Port', 'Default listening port for transcriptarr.py', '9000'),
         'TRANSCRIBE_DEVICE': ('Transcribe Device', 'Set as cpu or gpu', 'gpu'),
         # Defaulting to False here for the prompt, user can change
         'DEBUG': ('Debug', 'Enable debug logging (true/false)', 'False'),
@@ -51,13 +51,13 @@ def prompt_and_save_bazarr_env_variables():
     }
     user_input = {}
-    with open('subgen.env', 'w') as file:
+    with open('.env', 'w') as file:
         for var, (description, prompt, default) in env_vars.items():
             value = input(f"{prompt} [{default}]: ") or default
             file.write(f"{var}={value}\n")
-    print("Environment variables have been saved to subgen.env")
+    print("Environment variables have been saved to .env")
-def load_env_variables(env_filename='subgen.env'):
+def load_env_variables(env_filename='.env'):
     try:
         with open(env_filename, 'r') as file:
             for line in file:
@@ -93,7 +93,7 @@ def main():
     # Changed: action='store_true'
     parser.add_argument('-a', '--append', action='store_true', help="Append 'Transcribed by whisper' (overrides .env and external ENV)")
     parser.add_argument('-u', '--update', action='store_true', help="Update Subgen")
-    parser.add_argument('-x', '--exit-early', action='store_true', help="Exit without running subgen.py")
+    parser.add_argument('-x', '--exit-early', action='store_true', help="Exit without running transcriptarr.py")
     parser.add_argument('-s', '--setup-bazarr', action='store_true', help="Prompt for common Bazarr setup parameters and save them for future runs")
     parser.add_argument('-b', '--branch', type=str, default='main', help='Specify the branch to download from')
     parser.add_argument('-l', '--launcher-update', action='store_true', help="Update launcher.py and re-launch")
@@ -126,7 +126,7 @@ def main():
         # After saving, load them immediately for this run
         load_env_variables()
     else:
-        # Load if not setting up, assuming subgen.env might exist
+        # Load if not setting up, assuming .env might exist
         load_env_variables()
@@ -157,7 +157,7 @@ def main():
     if not os.path.exists(subgen_script_to_run) or args.update or convert_to_bool(os.getenv('UPDATE')):
         print(f"Downloading {subgen_script_to_run} from GitHub branch {branch_name}...")
-        download_from_github(f"https://raw.githubusercontent.com/McCloudS/subgen/{branch_name}/subgen.py", subgen_script_to_run)
+        download_from_github(f"https://raw.githubusercontent.com/McCloudS/subgen/{branch_name}/transcriptarr.py", subgen_script_to_run)
         print(f"Downloading {language_code_script_to_download} from GitHub branch {branch_name}...")
         download_from_github(f"https://raw.githubusercontent.com/McCloudS/subgen/{branch_name}/language_code.py", language_code_script_to_download)
@@ -165,8 +165,8 @@ def main():
         print(f"{subgen_script_to_run} exists and UPDATE is set to False, skipping download.")
     if not args.exit_early:
-        #print(f"DEBUG environment variable for subgen.py: {os.getenv('DEBUG')}")
-        #print(f"APPEND environment variable for subgen.py: {os.getenv('APPEND')}")
+        #print(f"DEBUG environment variable for transcriptarr.py: {os.getenv('DEBUG')}")
+        #print(f"APPEND environment variable for transcriptarr.py: {os.getenv('APPEND')}")
         print(f'Launching {subgen_script_to_run}')
         try:
             subprocess.run([python_cmd, '-u', subgen_script_to_run], check=True)
@@ -176,7 +176,7 @@ def main():
         print(f"Error running {subgen_script_to_run}: {e}")
     else:
-        print("Not running subgen.py: -x or --exit-early set")
+        print("Not running transcriptarr.py: -x or --exit-early set")
 if __name__ == "__main__":
     main()

requirements.txt
View File

@@ -1,10 +1,33 @@
-numpy
-stable-ts
+# Core dependencies
 fastapi
-requests
-faster-whisper
-uvicorn
+uvicorn[standard]
 python-multipart
+requests
+python-dotenv>=1.0.0
+
+# Database & ORM (SQLite is built-in)
+sqlalchemy>=2.0.0
+pydantic>=2.0.0
+pydantic-settings>=2.0.0
+
+# Media processing (CPU-only by default)
+numpy
+ffmpeg-python
-whisper
+watchdog
+
+# Optional dependencies (install based on configuration):
+#
+# For PostgreSQL database:
+# pip install psycopg2-binary
+#
+# For MariaDB/MySQL database:
+# pip install pymysql
+#
+# For Whisper transcription:
+# pip install openai-whisper faster-whisper stable-ts
+#
+# For GPU support (NVIDIA):
+# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+#
+# For media file handling:
+# pip install av>=10.0.0

163
test_backend.py Executable file
View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""Test script for TranscriptorIO backend components."""
import sys
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_config():
    """Test configuration loading."""
    logger.info("Testing configuration...")
    try:
        from backend.config import settings
        logger.info("✓ Config loaded successfully")
        logger.info(f"  - Mode: {settings.transcriptarr_mode}")
        logger.info(f"  - Database: {settings.database_type.value}")
        logger.info(f"  - Whisper Model: {settings.whisper_model}")
        logger.info(f"  - Device: {settings.transcribe_device}")
        return True
    except Exception as e:
        logger.error(f"✗ Config test failed: {e}")
        return False


def test_database():
    """Test database connection and table creation."""
    logger.info("\nTesting database...")
    try:
        from backend.core.database import database
        from backend.core.models import Base
        # Clean database for fresh test
        try:
            database.drop_tables()
            logger.info("  - Dropped existing tables for clean test")
        except Exception:
            pass
        database.create_tables()
        logger.info("✓ Database initialized with fresh tables")
        # Test connection with health check
        if database.health_check():
            logger.info("✓ Database connection OK")
        else:
            logger.error("✗ Database health check failed (but tables were created)")
            # Don't fail the test if health check fails but tables exist
            return True
        # Get stats
        stats = database.get_stats()
        logger.info(f"  - Type: {stats['type']}")
        logger.info(f"  - URL: {stats['url']}")
        return True
    except Exception as e:
        logger.error(f"✗ Database test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def test_queue_manager():
    """Test queue manager operations."""
    logger.info("\nTesting queue manager...")
    try:
        from backend.core.queue_manager import queue_manager
        from backend.core.models import QualityPreset
        # Add a test job
        job = queue_manager.add_job(
            file_path="/test/anime.mkv",
            file_name="anime.mkv",
            source_lang="ja",
            target_lang="es",
            quality_preset=QualityPreset.FAST,
            priority=5
        )
        if job:
            logger.info(f"✓ Job created: {job.id}")
            logger.info(f"  - File: {job.file_name}")
            logger.info(f"  - Status: {job.status.value}")
            logger.info(f"  - Priority: {job.priority}")
        else:
            logger.error("✗ Failed to create job")
            return False
        # Get queue stats
        stats = queue_manager.get_queue_stats()
        logger.info("✓ Queue stats:")
        logger.info(f"  - Total: {stats['total']}")
        logger.info(f"  - Queued: {stats['queued']}")
        logger.info(f"  - Processing: {stats['processing']}")
        logger.info(f"  - Completed: {stats['completed']}")
        # Try to add duplicate
        duplicate = queue_manager.add_job(
            file_path="/test/anime.mkv",
            file_name="anime.mkv",
            source_lang="ja",
            target_lang="es",
            quality_preset=QualityPreset.FAST
        )
        if duplicate is None:
            logger.info("✓ Duplicate detection working")
        else:
            logger.warning("⚠ Duplicate job was created (should have been rejected)")
        # Get next job
        next_job = queue_manager.get_next_job("test-worker-1")
        if next_job:
            logger.info(f"✓ Got next job: {next_job.id} (assigned to test-worker-1)")
            logger.info(f"  - Status: {next_job.status.value}")
        else:
            logger.error("✗ Failed to get next job")
            return False
        return True
    except Exception as e:
        logger.error(f"✗ Queue manager test failed: {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run all tests."""
    logger.info("=" * 60)
    logger.info("TranscriptorIO Backend Test Suite")
    logger.info("=" * 60)
    results = {
        "Config": test_config(),
        "Database": test_database(),
        "Queue Manager": test_queue_manager(),
    }
    logger.info("\n" + "=" * 60)
    logger.info("Test Results:")
    logger.info("=" * 60)
    all_passed = True
    for test_name, passed in results.items():
        status = "✓ PASSED" if passed else "✗ FAILED"
        logger.info(f"{test_name}: {status}")
        if not passed:
            all_passed = False
    logger.info("=" * 60)
    if all_passed:
        logger.info("🎉 All tests passed!")
        return 0
    else:
        logger.error("❌ Some tests failed")
        return 1


if __name__ == "__main__":
    sys.exit(main())