Files
convertions/utils/wavtotext.py
2025-01-31 20:07:02 -05:00

312 lines
5.7 KiB
Python
Executable File

import speech_recognition as sr
import sys
import os
import time
import subprocess
import requests
import zipfile
from pathlib import Path
# Vosk backs the offline fallback and is treated as optional: remember whether
# the import worked instead of failing when this module is loaded.
try:
    from vosk import Model, KaldiRecognizer
    vosk_installed = True
except ImportError:
    vosk_installed = False

# Directory the model is unpacked into (resolved against the working directory
# at import time) and the archive it is downloaded from.
VOSK_MODEL_DIR = os.path.abspath("model")
VOSK_MODEL_URL = "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"
def download_and_setup_vosk():
    """Download the Vosk model archive and unpack it into ``VOSK_MODEL_DIR``.

    The archive is streamed to ``vosk_model.zip`` in the working directory,
    extracted, and removed again (also when the download or the extraction
    fails). If the archive wraps the model in a single top-level folder, that
    folder is flattened so ``conf/`` and ``am/`` sit directly under
    ``VOSK_MODEL_DIR``, which is the layout ``verify_model_files`` checks.
    After verification the model directory is appended to ``.gitignore``.

    Raises:
        requests.RequestException: If the download fails or the server
            answers with an HTTP error status.
        zipfile.BadZipFile: If the downloaded file is not a valid archive.
        Exception: If required model files are missing after extraction.
    """
    import shutil  # only needed here, for flattening the extracted archive

    print("Downloading Vosk model... This may take a few minutes.")
    zip_path = "vosk_model.zip"
    try:
        with requests.get(VOSK_MODEL_URL, stream=True, timeout=60) as response:
            # Fail on 4xx/5xx instead of saving an error page as the archive.
            response.raise_for_status()
            with open(zip_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)
        print("Extracting Vosk model...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(VOSK_MODEL_DIR)
    finally:
        # Never leave a partial or corrupt archive behind.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    # Hoist the contents of a lone wrapper folder so the model files end up
    # directly inside VOSK_MODEL_DIR.
    model_root = Path(VOSK_MODEL_DIR)
    entries = list(model_root.iterdir())
    if len(entries) == 1 and entries[0].is_dir() and not (model_root / "conf").is_dir():
        wrapper = entries[0]
        for child in list(wrapper.iterdir()):
            shutil.move(str(child), str(model_root / child.name))
        wrapper.rmdir()

    print(f"Vosk model downloaded and set up in '{VOSK_MODEL_DIR}'.")
    # Verify that the model directory contains necessary files
    verify_model_files()
    # Add model directory to .gitignore. Ignore patterns are resolved relative
    # to the .gitignore file, so write a relative POSIX path rather than an
    # absolute filesystem path.
    ignore_entry = Path(os.path.relpath(VOSK_MODEL_DIR)).as_posix()
    with open(".gitignore", "a") as gitignore:
        gitignore.write(f"\n# Ignore Vosk model directory\n{ignore_entry}/\n")
def verify_model_files():
    """Check that the model directory holds the files the recogniser needs.

    Raises:
        Exception: Naming every required file that is absent from
            ``VOSK_MODEL_DIR``.
    """
    model_root = Path(VOSK_MODEL_DIR)
    absent = []
    for relative_name in ("conf/model.conf", "am/final.mdl"):
        if not (model_root / relative_name).exists():
            absent.append(relative_name)
    if not absent:
        return
    raise Exception(f"Model file(s) missing: {', '.join(absent)}. Re-download the Vosk model manually.")
def convert_to_vosk_compatible_wav(input_file):
    """Convert an audio file to 16 kHz mono 16-bit PCM WAV using ffmpeg.

    Args:
        input_file: Path of the audio file to convert.

    Returns:
        The path of the converted file (``"converted.wav"`` in the working
        directory), or ``None`` if the input is the output file itself,
        ffmpeg is not available, or the conversion fails.
    """
    output_file = "converted.wav"
    # ffmpeg cannot read and write the same file; refuse rather than let the
    # forced overwrite below destroy the input.
    if os.path.abspath(input_file) == os.path.abspath(output_file):
        print(f"Error converting file: input and output are both {output_file}.")
        return None

    command = [
        "ffmpeg",
        "-y",  # overwrite a leftover output instead of prompting on stdin
        "-i", input_file,
        "-ar", "16000",
        "-ac", "1",
        "-c:a", "pcm_s16le",
        output_file,
    ]
    print(f"Converting {input_file} to Vosk-compatible format...")
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised when the ffmpeg executable itself cannot be found.
        print("Error converting file: ffmpeg was not found. Install ffmpeg and make sure it is on PATH.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error converting file: {e}")
        return None
    print(f"Converted file saved as {output_file}.")
    return output_file
def online_wav_to_text(input_file):
    """Transcribe an audio file with the online Google recogniser.

    Request failures are retried with exponential backoff (1 s, then 2 s),
    for at most three attempts in total. No delay follows the final attempt.

    Args:
        input_file: Path of the audio file to transcribe.

    Returns:
        The recognised text, or ``None`` if the audio could not be understood
        or every attempt ended in a request error.
    """
    max_attempts = 3
    recognizer = sr.Recognizer()
    with sr.AudioFile(input_file) as source:
        print("Processing audio for online recognition...")
        audio_data = recognizer.record(source)

    for attempt in range(max_attempts):
        try:
            return recognizer.recognize_google(audio_data)
        except sr.RequestError as e:
            print(f"API error on attempt {attempt + 1}: {e}")
            # Back off only when another attempt follows; sleeping after the
            # last failure would just delay the caller's fallback.
            if attempt < max_attempts - 1:
                time.sleep(2 ** attempt)
        except sr.UnknownValueError:
            # Retrying cannot help when the audio itself is unintelligible.
            print("Speech recognition could not understand the audio.")
            return None
    return None
def offline_wav_to_text(input_file):
    """Transcribe a WAV file locally with Vosk.

    If the model directory does not exist, the user is asked on stdin
    whether to download it. The audio must be mono, 16-bit, sampled at
    8000 or 16000 Hz.

    Args:
        input_file: Path of the WAV file to transcribe.

    Returns:
        The recognised text (possibly an empty string), or ``None`` if Vosk
        is not installed, the model is unavailable or incomplete, or the
        audio format is unsupported.
    """
    import json  # recogniser results are JSON documents
    import wave

    # Without this check a missing package surfaces as a NameError on Model.
    if not vosk_installed:
        print("Vosk is not installed. Install it with 'pip install vosk' to use offline recognition.")
        return None

    if not Path(VOSK_MODEL_DIR).exists():
        print("Offline model not found. Would you like to set it up now? [y/N]")
        choice = input().strip().lower()
        if choice != 'y':
            print("Skipping offline setup. Exiting.")
            return None
        download_and_setup_vosk()

    try:
        verify_model_files()
    except Exception as e:
        print(e)
        return None

    # The context manager closes the file even if recognition raises.
    with wave.open(input_file, "rb") as wf:
        if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() not in (8000, 16000):
            print("Audio file must be WAV format mono PCM.")
            return None

        model = Model(VOSK_MODEL_DIR)
        recognizer = KaldiRecognizer(model, wf.getframerate())
        print("Processing audio for offline recognition...")
        raw_results = []
        while True:
            data = wf.readframes(4000)
            if not data:
                break
            if recognizer.AcceptWaveform(data):
                raw_results.append(recognizer.Result())
        # Flush whatever is still buffered; otherwise the last utterance
        # (or the whole text of a short clip) is lost.
        raw_results.append(recognizer.FinalResult())

    # Parse with json rather than eval: the payload is JSON, and eval would
    # both execute arbitrary expressions and choke on null/true/false.
    texts = (json.loads(raw).get("text", "") for raw in raw_results)
    return " ".join(text for text in texts if text)
def wav_to_text(input_file, output_file):
    """Transcribe ``input_file`` and write the text to ``output_file``.

    The input is first converted to the recogniser-friendly WAV layout, then
    sent to online recognition; offline recognition is used when the online
    path yields ``None``.

    Args:
        input_file: Path of the source audio; must end in ``.wav``
            (case-insensitive).
        output_file: Path of the text file to write.

    Returns:
        ``True`` if a non-empty transcription was written, ``False``
        otherwise.

    Raises:
        ValueError: If ``input_file`` does not have a ``.wav`` extension.
    """
    is_wav = input_file.lower().endswith('.wav')
    if not is_wav:
        raise ValueError("Input file must be a WAV file.")

    # Normalise the audio before handing it to either recogniser.
    prepared_wav = convert_to_vosk_compatible_wav(input_file)
    if not prepared_wav:
        print("File conversion failed. Unable to proceed.")
        return False

    transcript = online_wav_to_text(prepared_wav)
    if transcript is None:
        # Online path failed outright; try the local recogniser instead.
        print("Online recognition failed. Switching to offline recognition...")
        transcript = offline_wav_to_text(prepared_wav)

    if not transcript:
        print("Transcription failed. Please check the error message above.")
        return False

    with open(output_file, 'w') as out:
        out.write(transcript)
    print(f"Transcription completed successfully. Output saved to '{output_file}'.")
    return True
if __name__ == "__main__":
    # Command-line entry point: <script> <input_file.wav> <output_file.txt>.
    # Every failure path exits with status 1 so shells and calling scripts
    # can detect it; only a written transcription exits with 0.
    if len(sys.argv) != 3:
        # Show the name the script was actually invoked under.
        print(f"Usage: python {os.path.basename(sys.argv[0])} <input_file.wav> <output_file.txt>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    if not output_file.lower().endswith('.txt'):
        print("Output file must have a .txt extension.")
        sys.exit(1)
    if not os.path.exists(input_file):
        print(f"Input file {input_file} does not exist.")
        sys.exit(1)

    try:
        success = wav_to_text(input_file, output_file)
    except ValueError as e:
        # wav_to_text rejects inputs without a .wav extension; report that as
        # a normal error message instead of a traceback.
        print(e)
        success = False
    if not success:
        sys.exit(1)  # Exit with a non-zero status code on failure