Merge pull request #66 from McCloudS/RemoveTransformers

Remove transformers
2024-03-21 09:40:58 -06:00
parent 2a33b8ad9d 3241960e5f
commit bfaed2d819
3 changed files with 6 additions and 29 deletions
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 <details>
 <summary>Updates:</summary>

-21 Mar 2024: Added a 'wizard' into the launcher that will help standalone users get common Bazarr variables configured.  See below in Launcher section.
+21 Mar 2024: Added a 'wizard' into the launcher that will help standalone users get common Bazarr variables configured.  See below in Launcher section.  Removed 'Transformers' as an option.  While I usually don't like to remove features, I don't think anyone is using this, and the results are wildly unpredictable and often cause out-of-memory errors.  

 19 Mar 2024: Added a `MONITOR` environment variable.  Will 'watch' or 'monitor' your `TRANSCRIBE_FOLDERS` for changes and run on them.  Useful if you just want to paste files into a folder and get subtitles.   

@@ -169,8 +169,6 @@ The following environment variables are available in Docker.  They will default
 | DEBUG                     | True                  | Provides some debug data that can be helpful to troubleshoot path mapping and other issues. Fun fact, if this is set to true, any modifications to the script will auto-reload it (if it isn't actively transcoding).  Useful to make small tweaks without re-downloading the whole file. |
 | FORCE_DETECTED_LANGUAGE_TO | '' | This is to force the model to a language instead of the detected one, takes a 2 letter language code.  For example, your audio is French but keeps detecting as English, you would set it to 'fr' |
 | CLEAR_VRAM_ON_COMPLETE | True | This will delete the model and do garbage collection when queue is empty.  Good if you need to use the VRAM for something else. |
-| HF_TRANSFORMERS | False | Uses Hugging Face Transformers models that should be faster, not tested as of now because HF is down. |
-| HF_BATCH_SIZE | 24 | Batch size to be used with above.  Batch size has a correlation to VRAM, not sure what it is yet and may require tinkering.  
 | UPDATE | False | Will pull latest subgen.py from the repository if True.  False will use the original subgen.py built into the Docker image.  Standalone users can use this with launcher.py to get updates. |
 | APPEND | False | Will add the following at the end of a subtitle: "Transcribed by whisperAI with faster-whisper ({whisper_model}) on {datetime.now()}"
 | MONITOR | False | Will monitor `TRANSCRIBE_FOLDERS` for real-time changes to see if we need to generate subtitles |
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,4 @@ uvicorn
 python-multipart
 python-ffmpeg
 whisper
-transformers
-accelerate
-optimum
 watchdog
--- a/subgen.py
+++ b/subgen.py
@@ -53,8 +53,6 @@ monitor = convert_to_bool(os.getenv('MONITOR', False))
 transcribe_folders = os.getenv('TRANSCRIBE_FOLDERS', '')
 transcribe_or_translate = os.getenv('TRANSCRIBE_OR_TRANSLATE', 'transcribe')
 force_detected_language_to = os.getenv('FORCE_DETECTED_LANGUAGE_TO', '')
-hf_transformers = convert_to_bool(os.getenv('HF_TRANSFORMERS', False))
-hf_batch_size = int(os.getenv('HF_BATCH_SIZE', 24))
 clear_vram_on_complete = convert_to_bool(os.getenv('CLEAR_VRAM_ON_COMPLETE', True))
 compute_type = os.getenv('COMPUTE_TYPE', 'auto')
 append = convert_to_bool(os.getenv('APPEND', False))
@@ -334,10 +332,7 @@ def asr(
        start_model()
        files_to_transcribe.insert(0, f"Bazarr-asr-{random_name}")
        audio_data = np.frombuffer(audio_file.file.read(), np.int16).flatten().astype(np.float32) / 32768.0
-        if(hf_transformers):
-            result = model.transcribe(audio_data, task=task, input_sr=16000, language=language, batch_size=hf_batch_size, progress_callback=progress)
-        else:
-            result = model.transcribe_stable(audio_data, task=task, input_sr=16000, language=language, progress_callback=progress)
+        result = model.transcribe_stable(audio_data, task=task, input_sr=16000, language=language, progress_callback=progress)
        appendLine(result)
        elapsed_time = time.time() - start_time
        minutes, seconds = divmod(int(elapsed_time), 60)
@@ -370,10 +365,7 @@ def detect_language(
        random_name = random.choices("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890", k=6)
        files_to_transcribe.insert(0, f"Bazarr-detect-language-{random_name}")
        audio_data = np.frombuffer(audio_file.file.read(), np.int16).flatten().astype(np.float32) / 32768.0
-        if(hf_transformers):
-            detected_lang_code = model.transcribe(whisper.pad_or_trim(audio_data), input_sr=16000, batch_size=hf_batch_size).language
-        else:
-            detected_lang_code = model.transcribe_stable(whisper.pad_or_trim(audio_data), input_sr=16000).language
+        detected_lang_code = model.transcribe_stable(whisper.pad_or_trim(audio_data), input_sr=16000).language
            
    except Exception as e:
        logging.info(f"Error processing or transcribing Bazarr {audio_file.filename}: {e}")
@@ -389,11 +381,7 @@ def start_model():
    global model
    if model is None:
        logging.debug("Model was purged, need to re-create")
-        if(hf_transformers):
-            logging.debug("Using Hugging Face Transformers, whisper_threads, concurrent_transcriptions, and model_location variables are ignored!")
-            model = stable_whisper.load_hf_whisper(whisper_model, device=transcribe_device)
-        else:
-            model = stable_whisper.load_faster_whisper(whisper_model, download_root=model_location, device=transcribe_device, cpu_threads=whisper_threads, num_workers=concurrent_transcriptions, compute_type=compute_type)
+        model = stable_whisper.load_faster_whisper(whisper_model, download_root=model_location, device=transcribe_device, cpu_threads=whisper_threads, num_workers=concurrent_transcriptions, compute_type=compute_type)

 def delete_model():
    if clear_vram_on_complete and len(files_to_transcribe) == 0:
@@ -444,10 +432,7 @@ def gen_subtitles(file_path: str, transcribe_or_translate: str, front=True, forc
            if force_detected_language_to:
                forceLanguage = force_detected_language_to
                logging.info(f"Forcing language to {forceLanguage}")
-            if(hf_transformers):
-                result = model.transcribe(file_path, language=forceLanguage, batch_size=hf_batch_size, task=transcribe_or_translate, progress_callback=progress)
-            else:
-                result = model.transcribe_stable(file_path, language=forceLanguage, task=transcribe_or_translate, progress_callback=progress)
+            result = model.transcribe_stable(file_path, language=forceLanguage, task=transcribe_or_translate, progress_callback=progress)
            appendLine(result)
            result.to_srt_vtt(get_file_name_without_extension(file_path) + subextension, word_level=word_level_highlight)
            elapsed_time = time.time() - start_time
@@ -772,10 +757,7 @@ if __name__ == "__main__":
    logging.info(f"Transcriptions are limited to running {str(concurrent_transcriptions)} at a time")
    logging.info(f"Running {str(whisper_threads)} threads per transcription")
    logging.info(f"Using {transcribe_device} to encode")
-    if hf_transformers:
-        logging.info(f"Using Hugging Face Transformers")
-    else:
-        logging.info(f"Using faster-whisper")
+    logging.info(f"Using faster-whisper")
    if transcribe_folders:
        transcribe_existing(transcribe_folders)
    uvicorn.run("subgen:app", host="0.0.0.0", port=int(webhookport), reload=reload_script_on_change, use_colors=True)