prakashp1893 committed on
Commit
9077cd6
·
verified ·
1 Parent(s): 6c42c01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -22
app.py CHANGED
@@ -7,50 +7,60 @@ import soundfile as sf
7
  import os
8
  from pydub import AudioSegment
9
 
 
 
 
 
 
 
10
  # Initialize the FastAPI app
11
  app = FastAPI()
12
 
13
- # Load the pre-trained model and processor
14
  model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
15
- processor = Wav2Vec2Processor.from_pretrained(model_name)
16
- model = Wav2Vec2ForCTC.from_pretrained(model_name)
17
 
18
  # Ensure the model is in evaluation mode
19
  model.eval()
20
 
21
  # Function to convert audio to the required format
22
  def convert_audio(audio_bytes):
23
- # Load audio from bytes
24
- audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
25
- # Set to mono
26
- audio = audio.set_channels(1)
27
- # Set sample rate to 16kHz
28
- audio = audio.set_frame_rate(16000)
 
29
 
30
- # Export to a buffer
31
- buffer = io.BytesIO()
32
- audio.export(buffer, format="wav")
33
- buffer.seek(0)
34
- return buffer.read()
 
 
 
35
 
36
 
37
  @app.post("/assess-pronunciation/")
38
  async def assess_pronunciation(audio_file: UploadFile = File(...)):
39
  """
40
- This endpoint takes an audio file and returns the recognized phonemes.
41
  """
42
  # Read the audio file content
43
  audio_bytes = await audio_file.read()
44
 
45
- # Convert audio to the model's required format (16kHz, mono)
46
- processed_audio_bytes = convert_audio(audio_bytes)
 
 
 
47
 
48
- # Load the waveform and sample rate from the processed audio bytes
49
- waveform, sample_rate = sf.read(io.BytesIO(processed_audio_bytes), dtype='float32')
50
 
51
- # Ensure the audio is a 1D tensor
52
- if waveform.ndim > 1:
53
- waveform = waveform.mean(axis=1)
54
 
55
  # Process the audio waveform
56
  input_values = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding="longest").input_values
@@ -63,6 +73,7 @@ async def assess_pronunciation(audio_file: UploadFile = File(...)):
63
  predicted_ids = torch.argmax(logits, dim=-1)
64
  transcription = processor.batch_decode(predicted_ids)
65
 
 
66
  return {"phoneme_transcription": transcription[0]}
67
 
68
  @app.get("/")
 
7
  import os
8
  from pydub import AudioSegment
9
 
10
+ # --- FIX: Define a local cache directory ---
11
+ # This tells transformers to download models here, inside our app's folder,
12
+ # instead of the restricted '/.cache' directory.
13
+ CACHE_DIR = "/code/cache"
14
+ os.makedirs(CACHE_DIR, exist_ok=True)
15
+
16
  # Initialize the FastAPI app
17
  app = FastAPI()
18
 
19
+ # --- FIX: Load model and processor using the local cache_dir ---
20
  model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
21
+ processor = Wav2Vec2Processor.from_pretrained(model_name, cache_dir=CACHE_DIR)
22
+ model = Wav2Vec2ForCTC.from_pretrained(model_name, cache_dir=CACHE_DIR)
23
 
24
  # Ensure the model is in evaluation mode
25
  model.eval()
26
 
27
  # Function to convert audio to the required format
28
  def convert_audio(audio_bytes):
29
+ try:
30
+ # Load audio from bytes using pydub
31
+ audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
32
+ # Set to mono
33
+ audio = audio.set_channels(1)
34
+ # Set sample rate to 16kHz
35
+ audio = audio.set_frame_rate(16000)
36
 
37
+ # Export to a buffer in WAV format
38
+ buffer = io.BytesIO()
39
+ audio.export(buffer, format="wav")
40
+ buffer.seek(0)
41
+ return buffer.read()
42
+ except Exception as e:
43
+ # This will catch errors if ffmpeg has issues with a specific file
44
+ raise ValueError(f"Error processing audio file: {e}")
45
 
46
 
47
  @app.post("/assess-pronunciation/")
48
  async def assess_pronunciation(audio_file: UploadFile = File(...)):
49
  """
50
+ This endpoint takes an audio file, converts it, and returns the recognized phonemes.
51
  """
52
  # Read the audio file content
53
  audio_bytes = await audio_file.read()
54
 
55
+ # Convert audio to the model's required format (16kHz, mono WAV)
56
+ try:
57
+ processed_audio_bytes = convert_audio(audio_bytes)
58
+ except ValueError as e:
59
+ return {"error": str(e)}
60
 
 
 
61
 
62
+ # Load the waveform from the processed audio bytes
63
+ waveform, sample_rate = sf.read(io.BytesIO(processed_audio_bytes), dtype='float32')
 
64
 
65
  # Process the audio waveform
66
  input_values = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding="longest").input_values
 
73
  predicted_ids = torch.argmax(logits, dim=-1)
74
  transcription = processor.batch_decode(predicted_ids)
75
 
76
+ # The output is a list with one item, so we return the item itself
77
  return {"phoneme_transcription": transcription[0]}
78
 
79
  @app.get("/")