video to text added

Files changed:
- README.md +2 -1
- app.py +162 -0
- data/asl_classifier.pth +3 -0
- data/asl_landmarks_final.csv +0 -0
- data/letters_seq.mp4 +0 -0
- requirements.txt +8 -0
README.md
CHANGED
@@ -10,4 +10,5 @@ pinned: false
 short_description: A mini project of sign language conversation
 ---
 
-
+# Bi Directional Sign Language Conversation
+This is a web app (roughly 50% complete) for real-time communication between deaf or hard-of-hearing people and a person who does not know sign language.
app.py
ADDED
@@ -0,0 +1,162 @@
import torch
import torch.nn as nn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import gradio as gr
import cv2
import mediapipe as mp
import numpy as np
import spaces  # ZeroGPU helper; the standalone `spaces` package, not part of huggingface_hub

# Define the ASLClassifier model: a 5-layer MLP over 63 hand-landmark values
# (21 MediaPipe landmarks x x/y/z) with batch norm and dropout after each hidden layer
class ASLClassifier(nn.Module):
    def __init__(self, input_size=63, hidden_size=256, num_classes=28):
        super(ASLClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, hidden_size * 2)
        self.bn2 = nn.BatchNorm1d(hidden_size * 2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(hidden_size * 2, hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(hidden_size, hidden_size // 2)
        self.bn4 = nn.BatchNorm1d(hidden_size // 2)
        self.relu4 = nn.ReLU()
        self.dropout4 = nn.Dropout(0.3)
        self.fc5 = nn.Linear(hidden_size // 2, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        x = self.fc4(x)
        x = self.bn4(x)
        x = self.relu4(x)
        x = self.dropout4(x)
        x = self.fc5(x)
        return x

# Load the model and label encoder (CPU initially; GPU inference is handled by @spaces.GPU)
device = torch.device('cpu')
model = ASLClassifier().to(device)
model.load_state_dict(torch.load('data/asl_classifier.pth', map_location=device))
model.eval()

# Fit the label encoder on the same labels the model was trained with
df = pd.read_csv('data/asl_landmarks_final.csv')
label_encoder = LabelEncoder()
label_encoder.fit(df['label'].values)

# Initialize MediaPipe Hands (runs on CPU)
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
mp_drawing = mp.solutions.drawing_utils

# Prediction function with GPU offloading
@spaces.GPU
def predict_letter(landmarks, model, label_encoder):
    with torch.no_grad():
        # Move input and model to GPU for inference (GPU allocation handled by the decorator)
        landmarks = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0).to('cuda')
        model = model.to('cuda')
        output = model(landmarks)
        _, predicted_idx = torch.max(output, 1)
        letter = label_encoder.inverse_transform([predicted_idx.item()])[0]
        # Move the model back to CPU to free GPU memory
        model = model.to('cpu')
        return letter

# Video processing function (CPU for video decoding and landmark extraction, GPU for prediction)
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None, "Error: Could not open video."

    text_output = ""   # accumulated predicted letters
    out_frames = []    # annotated frames for the output video

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Detect hand landmarks with MediaPipe (expects RGB input)
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = hands.process(frame_rgb)

        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                # Draw landmarks on the frame
                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

                # Flatten the 21 landmarks into a 63-value feature vector and classify
                landmarks = []
                for lm in hand_landmarks.landmark:
                    landmarks.extend([lm.x, lm.y, lm.z])
                landmarks = np.array(landmarks, dtype=np.float32)
                predicted_letter = predict_letter(landmarks, model, label_encoder)

                # Append the letter, skipping immediate repeats
                if not text_output or predicted_letter != text_output[-1]:
                    text_output += predicted_letter

                # Overlay the predicted letter on the frame
                cv2.putText(frame, f"Letter: {predicted_letter}", (10, 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Store the (possibly annotated) frame
        out_frames.append(frame)

    cap.release()

    if not out_frames:
        return None, "Error: No frames could be read from the video."

    # Write the annotated frames to a new video file at a fixed 20 fps
    out_path = "processed_video.mp4"
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(out_path, fourcc, 20.0, (out_frames[0].shape[1], out_frames[0].shape[0]))
    for frame in out_frames:
        out.write(frame)
    out.release()

    return out_path, text_output

# Create the Gradio interface with a sample input
with gr.Blocks(title="Sign Language Translation") as demo:
    gr.Markdown("## Sign Language Translation")
    video_input = gr.Video(label="Input Video", sources=["upload", "webcam"])
    video_output = gr.Video(label="Processed Video with Landmarks")
    text_output = gr.Textbox(label="Predicted Text", interactive=False)

    # Button to process the uploaded video
    btn = gr.Button("Translate")
    btn.click(
        fn=process_video,
        inputs=video_input,
        outputs=[video_output, text_output]
    )

    # Sample input video (matches the file added under data/)
    gr.Examples(
        examples=[["data/letters_seq.mp4"]],
        inputs=[video_input],
        outputs=[video_output, text_output],
        fn=process_video,
        cache_examples=True  # Cache the sample output for faster loading
    )

# Launch the app (guarded so the module can be imported without starting a server)
if __name__ == "__main__":
    demo.launch()
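Since load_state_dict will fail if the saved weights do not match the layer sizes defined above, the checkpoint can be sanity-checked on CPU without running the app. The snippet below is a minimal sketch, not part of this commit; the script name check_checkpoint.py is hypothetical.

# check_checkpoint.py -- hypothetical consistency check, not part of this commit
import torch

# The .pth file is a plain state dict (app.py loads it with load_state_dict)
state = torch.load('data/asl_classifier.pth', map_location='cpu')

# Shapes implied by ASLClassifier(input_size=63, hidden_size=256, num_classes=28)
print("fc1.weight:", tuple(state['fc1.weight'].shape), "-- expected (256, 63)")
print("fc5.weight:", tuple(state['fc5.weight'].shape), "-- expected (28, 128)")
print("parameter tensors in checkpoint:", len(state))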
data/asl_classifier.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7e80bfae266cf8a67e10b4918814dab2701e0e5637f7cf8a7798b82917b143f6
size 1291050
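The .pth file is stored through Git LFS, so the three lines above are only a pointer. Once the real file has been fetched (for example by cloning the Space with LFS enabled), it can be cross-checked against the pointer with the standard library; the expected size and oid below are copied from the pointer itself.

import hashlib
import os

path = 'data/asl_classifier.pth'
print("size on disk:", os.path.getsize(path), "-- pointer says 1291050")
with open(path, 'rb') as f:
    digest = hashlib.sha256(f.read()).hexdigest()
print("sha256:", digest)
print("matches pointer oid:", digest == "7e80bfae266cf8a67e10b4918814dab2701e0e5637f7cf8a7798b82917b143f6")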
data/asl_landmarks_final.csv
ADDED
The diff for this file is too large to render.
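The landmark CSV is not rendered here, but it can be inspected locally to confirm that the label column read in app.py exists and covers the 28 classes the classifier head expects. A small sketch; the assumption that every non-label column is a flattened landmark feature is mine, not something shown in this commit.

import pandas as pd

df = pd.read_csv('data/asl_landmarks_final.csv')
print(df['label'].nunique(), "classes:", sorted(df['label'].unique()))
# Assuming one label column plus flattened landmark features per row:
print(df.shape[0], "rows,", df.shape[1] - 1, "feature columns (63 expected: 21 landmarks x x/y/z)")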
data/letters_seq.mp4
ADDED
Binary file (584 kB).
requirements.txt
ADDED
@@ -0,0 +1,8 @@
torch
pandas
scikit-learn
gradio
opencv-python
mediapipe
numpy
huggingface_hub