"""Transcribe a WAV file to text.

The input is first converted with ffmpeg to 16 kHz mono 16-bit PCM, then sent
to Google's online recognizer (via ``speech_recognition``).  If that fails,
the script falls back to offline recognition with a local Vosk model, which
it can download on request.

Usage: python wav_to_text.py <input.wav> <output.txt>
"""
import json
import os
import subprocess
import sys
import time
import wave
import zipfile
from pathlib import Path

try:
    import requests
except ImportError:  # only needed to download the offline model
    requests = None

try:
    import speech_recognition as sr
except ImportError:  # only needed for online recognition
    sr = None

try:
    from vosk import Model, KaldiRecognizer
except ImportError:
    vosk_installed = False
else:
    vosk_installed = True

VOSK_MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"
VOSK_MODEL_DIR = os.path.abspath("model")

# Files (relative to VOSK_MODEL_DIR) that must exist for the model to be usable.
REQUIRED_MODEL_FILES = ("conf/model.conf", "am/final.mdl")


def _flatten_model_dir(model_dir):
    """Hoist a single wrapping folder's contents up into *model_dir*.

    Model archives are expected to unpack into one top-level folder named
    after the model, which would leave the required files one level too deep.
    Does nothing if the required files are already in place or if the
    directory does not contain exactly one sub-directory.
    """
    if all((model_dir / rel).exists() for rel in REQUIRED_MODEL_FILES):
        return
    entries = list(model_dir.iterdir())
    if len(entries) != 1 or not entries[0].is_dir():
        return
    wrapper = entries[0]
    for child in list(wrapper.iterdir()):
        child.rename(model_dir / child.name)
    wrapper.rmdir()


def _join_transcripts(raw_results):
    """Join the "text" fields of Vosk JSON result strings with single spaces.

    Empty segments are skipped so the output has no doubled spaces.
    """
    texts = (json.loads(raw).get("text", "") for raw in raw_results)
    return " ".join(text for text in texts if text)


def download_and_setup_vosk():
    """Download the Vosk model, extract it into VOSK_MODEL_DIR and verify it.

    Also appends the model directory to ``.gitignore`` in the current
    working directory.

    Raises:
        RuntimeError: if the ``requests`` package is not installed.
        requests.RequestException: if the download fails or returns an
            HTTP error status.
        zipfile.BadZipFile: if the downloaded file is not a valid archive.
        FileNotFoundError: if required model files are missing afterwards.
    """
    if requests is None:
        raise RuntimeError(
            "The 'requests' package is required to download the Vosk model."
        )

    print("Downloading Vosk model... This may take a few minutes.")
    zip_path = "vosk_model.zip"
    try:
        with requests.get(VOSK_MODEL_URL, stream=True, timeout=30) as response:
            # Without this an HTML error page would be saved as the "zip".
            response.raise_for_status()
            with open(zip_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

        print("Extracting Vosk model...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(VOSK_MODEL_DIR)
    finally:
        # Never leave a partial or already-extracted archive behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    _flatten_model_dir(Path(VOSK_MODEL_DIR))

    # Verify that the model directory contains necessary files
    verify_model_files()
    print(f"Vosk model downloaded and set up in '{VOSK_MODEL_DIR}'.")

    # Add model directory to .gitignore.  Git patterns are relative to the
    # .gitignore file, so a relative POSIX-style path is written rather than
    # the absolute one.
    ignore_entry = Path(os.path.relpath(VOSK_MODEL_DIR)).as_posix() + "/"
    with open(".gitignore", "a", encoding="utf-8") as gitignore:
        gitignore.write(f"\n# Ignore Vosk model directory\n{ignore_entry}\n")


def verify_model_files():
    """Verify that all required files are present in the model directory.

    Raises:
        FileNotFoundError: listing every required file that is missing.
    """
    missing_files = [
        file for file in REQUIRED_MODEL_FILES
        if not Path(VOSK_MODEL_DIR, file).exists()
    ]
    if missing_files:
        raise FileNotFoundError(
            f"Model file(s) missing: {', '.join(missing_files)}. "
            "Re-download the Vosk model manually."
        )


def convert_to_vosk_compatible_wav(input_file):
    """Convert the WAV file to a Vosk-compatible format using ffmpeg.

    The output is 16 kHz, mono, signed 16-bit PCM, written to
    ``converted.wav`` in the current working directory (overwritten if it
    already exists).

    Returns:
        The output file name, or None if ffmpeg is missing or fails.
    """
    output_file = "converted.wav"
    command = [
        "ffmpeg",
        "-y",  # overwrite a stale converted.wav instead of prompting on stdin
        "-i", input_file,
        "-ar", "16000",
        "-ac", "1",
        "-c:a", "pcm_s16le",
        output_file,
    ]
    print(f"Converting {input_file} to Vosk-compatible format...")
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        print("Error converting file: ffmpeg was not found on PATH.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error converting file: {e}")
        return None
    print(f"Converted file saved as {output_file}.")
    return output_file


def online_wav_to_text(input_file):
    """Transcribe *input_file* with Google's online recognizer.

    API errors are retried up to three times with exponential backoff.

    Returns:
        The transcript, or None if ``speech_recognition`` is not installed,
        the audio could not be understood, or every attempt failed.
    """
    if sr is None:
        print("The 'speech_recognition' package is not installed; "
              "skipping online recognition.")
        return None

    recognizer = sr.Recognizer()
    with sr.AudioFile(input_file) as source:
        print("Processing audio for online recognition...")
        audio_data = recognizer.record(source)

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            return recognizer.recognize_google(audio_data)
        except sr.RequestError as e:
            print(f"API error on attempt {attempt + 1}: {e}")
            # Back off only when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
        except sr.UnknownValueError:
            print("Speech recognition could not understand the audio.")
            return None
    return None


def offline_wav_to_text(input_file):
    """Transcribe *input_file* offline with the local Vosk model.

    If the model directory does not exist the user is asked whether to
    download it.  The audio must be mono 16-bit PCM at 8000 or 16000 Hz.

    Returns:
        The transcript (possibly an empty string), or None if vosk is not
        installed, the model is unavailable, or the audio is unsuitable.
    """
    if not vosk_installed:
        print("Offline recognition requires the 'vosk' package.")
        return None

    if not Path(VOSK_MODEL_DIR).exists():
        print("Offline model not found. Would you like to set it up now? [y/N]")
        choice = input().strip().lower()
        if choice != 'y':
            print("Skipping offline setup. Exiting.")
            return None
        try:
            download_and_setup_vosk()
        except Exception as e:  # network, archive or verification failure
            print(f"Offline model setup failed: {e}")
            return None

    try:
        verify_model_files()
    except Exception as e:
        print(e)
        return None

    try:
        wf = wave.open(input_file, "rb")
    except (wave.Error, OSError) as e:
        print(f"Could not read audio file: {e}")
        return None

    # The context manager closes the file even if model loading raises.
    with wf:
        if (wf.getnchannels() != 1 or wf.getsampwidth() != 2
                or wf.getframerate() not in (8000, 16000)):
            print("Audio file must be WAV format mono PCM.")
            return None

        model = Model(VOSK_MODEL_DIR)
        recognizer = KaldiRecognizer(model, wf.getframerate())
        print("Processing audio for offline recognition...")

        results = []
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if recognizer.AcceptWaveform(data):
                results.append(recognizer.Result())
        # Flush whatever is still buffered after the last completed
        # utterance; without this the tail of the recording is lost.
        results.append(recognizer.FinalResult())

    # Combine results into a single text.  Vosk returns JSON strings, so
    # they are parsed with json rather than eval.
    return _join_transcripts(results)


def wav_to_text(input_file, output_file):
    """Transcribe *input_file* and write the transcript to *output_file*.

    Online recognition is tried first, with offline recognition as the
    fallback.

    Returns:
        True if a non-empty transcript was written, otherwise False.

    Raises:
        ValueError: if *input_file* does not have a ``.wav`` extension.
    """
    if not input_file.lower().endswith('.wav'):
        raise ValueError("Input file must be a WAV file.")

    # Check and convert file format if necessary
    converted_file = convert_to_vosk_compatible_wav(input_file)
    if not converted_file:
        print("File conversion failed. Unable to proceed.")
        return False

    text = online_wav_to_text(converted_file)
    if text is None:  # Fallback to offline if online fails
        print("Online recognition failed. Switching to offline recognition...")
        text = offline_wav_to_text(converted_file)

    if text:
        with open(output_file, 'w', encoding="utf-8") as f:
            f.write(text)
        print(f"Transcription completed successfully. Output saved to '{output_file}'.")
        return True

    print("Transcription failed. Please check the error message above.")
    return False


def main(argv=None):
    """Run the command-line interface and return the process exit status.

    Args:
        argv: full argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 2 on a usage error, 1 on any other failure.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        print("Usage: python wav_to_text.py <input.wav> <output.txt>")
        return 2

    input_file, output_file = argv[1], argv[2]
    if not output_file.lower().endswith('.txt'):
        print("Output file must have a .txt extension.")
        return 1
    if not os.path.exists(input_file):
        print(f"Input file {input_file} does not exist.")
        return 1

    try:
        success = wav_to_text(input_file, output_file)
    except ValueError as e:
        print(e)
        return 1
    return 0 if success else 1


if __name__ == "__main__":
    sys.exit(main())  # Exit with a non-zero status code on failure