Spaces:

prakashp1893
/

pronunce-api

Runtime error

App Files Files Community

prakashp1893 committed 24 days ago

Commit

556e71e

verified ·

1 Parent(s): 39ea74e

Create app.py

Browse files

Files changed (1) hide show

app.py +70 -0

app.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import io
+import os
+
+import soundfile as sf
+import torch
+import torchaudio
+from fastapi import FastAPI, File, HTTPException, UploadFile
+from pydub import AudioSegment
+from pydub.exceptions import CouldntDecodeError
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+# Initialize the FastAPI app; the route decorators below register their handlers on it.
+app = FastAPI()
+# Load the pre-trained model and processor once, at import time, so that every
+# request reuses the same instances instead of loading them per call.
+model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
+processor = Wav2Vec2Processor.from_pretrained(model_name)
+model = Wav2Vec2ForCTC.from_pretrained(model_name)
+# Ensure the model is in evaluation mode: it is only used for inference here.
+model.eval()
+# Function to convert audio to the required format
+def convert_audio(audio_bytes):
+    # Load audio from bytes
+    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
+    # Set to mono
+    audio = audio.set_channels(1)
+    # Set sample rate to 16kHz
+    audio = audio.set_frame_rate(16000)
+    # Export to a buffer
+    buffer = io.BytesIO()
+    audio.export(buffer, format="wav")
+    buffer.seek(0)
+    return buffer.read()
+@app.post("/assess-pronunciation/")
+async def assess_pronunciation(audio_file: UploadFile = File(...)):
+    """
+    This endpoint takes an audio file and returns the recognized phonemes.
+    """
+    # Read the audio file content
+    audio_bytes = await audio_file.read()
+    # Convert audio to the model's required format (16kHz, mono)
+    processed_audio_bytes = convert_audio(audio_bytes)
+    # Load the waveform and sample rate from the processed audio bytes
+    waveform, sample_rate = sf.read(io.BytesIO(processed_audio_bytes), dtype='float32')
+    # Ensure the audio is a 1D tensor
+    if waveform.ndim > 1:
+        waveform = waveform.mean(axis=1)
+    # Process the audio waveform
+    input_values = processor(waveform, sampling_rate=sample_rate, return_tensors="pt", padding="longest").input_values
+    # Perform inference
+    with torch.no_grad():
+        logits = model(input_values).logits
+    # Get the predicted IDs and decode them into phonemes
+    predicted_ids = torch.argmax(logits, dim=-1)
+    transcription = processor.batch_decode(predicted_ids)
+    return {"phoneme_transcription": transcription[0]}
+@app.get("/")
+def read_root():
+    return {"message": "Wav2Vec2 Pronunciation Assessment API is running."}