JaiSurya committed
Commit cc56315 · 1 Parent(s): 879783b

video to text added

README.md CHANGED
@@ -10,4 +10,5 @@ pinned: false
 short_description: A mini project of sign language conversation
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Bi Directional Sign Language Conversation
+This is a web app (roughly 50% complete) for real-time communication between deaf or hard-of-hearing people and a hearing person who does not know sign language.
app.py ADDED
@@ -0,0 +1,162 @@
+import torch
+import torch.nn as nn
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder
+import gradio as gr
+import cv2
+import mediapipe as mp
+import numpy as np
+import spaces  # ZeroGPU package providing the @spaces.GPU decorator used below
+
+# Define the ASLClassifier model
+class ASLClassifier(nn.Module):
+    def __init__(self, input_size=63, hidden_size=256, num_classes=28):
+        super(ASLClassifier, self).__init__()
+        self.fc1 = nn.Linear(input_size, hidden_size)
+        self.bn1 = nn.BatchNorm1d(hidden_size)
+        self.relu1 = nn.ReLU()
+        self.dropout1 = nn.Dropout(0.3)
+        self.fc2 = nn.Linear(hidden_size, hidden_size * 2)
+        self.bn2 = nn.BatchNorm1d(hidden_size * 2)
+        self.relu2 = nn.ReLU()
+        self.dropout2 = nn.Dropout(0.3)
+        self.fc3 = nn.Linear(hidden_size * 2, hidden_size)
+        self.bn3 = nn.BatchNorm1d(hidden_size)
+        self.relu3 = nn.ReLU()
+        self.dropout3 = nn.Dropout(0.3)
+        self.fc4 = nn.Linear(hidden_size, hidden_size // 2)
+        self.bn4 = nn.BatchNorm1d(hidden_size // 2)
+        self.relu4 = nn.ReLU()
+        self.dropout4 = nn.Dropout(0.3)
+        self.fc5 = nn.Linear(hidden_size // 2, num_classes)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.bn1(x)
+        x = self.relu1(x)
+        x = self.dropout1(x)
+        x = self.fc2(x)
+        x = self.bn2(x)
+        x = self.relu2(x)
+        x = self.dropout2(x)
+        x = self.fc3(x)
+        x = self.bn3(x)
+        x = self.relu3(x)
+        x = self.dropout3(x)
+        x = self.fc4(x)
+        x = self.bn4(x)
+        x = self.relu4(x)
+        x = self.dropout4(x)
+        x = self.fc5(x)
+        return x
+
+# Load the model and label encoder (CPU initially, GPU handled by decorator)
+device = torch.device('cpu')  # Default to CPU; GPU inference handled by @spaces.GPU
+model = ASLClassifier().to(device)
+model.load_state_dict(torch.load('data/asl_classifier.pth', map_location=device))
+model.eval()
+
+df = pd.read_csv('data/asl_landmarks_final.csv')
+label_encoder = LabelEncoder()
+label_encoder.fit(df['label'].values)
+
+# Initialize MediaPipe (runs on CPU)
+mp_hands = mp.solutions.hands
+hands = mp_hands.Hands(static_image_mode=False, max_num_hands=1, min_detection_confidence=0.5)
+mp_drawing = mp.solutions.drawing_utils
+
+# Prediction function with GPU offloading
+@spaces.GPU
+def predict_letter(landmarks, model, label_encoder):
+    with torch.no_grad():
+        # Move to GPU for inference (handled by decorator)
+        landmarks = torch.tensor(landmarks, dtype=torch.float32).unsqueeze(0).to('cuda')
+        model = model.to('cuda')
+        output = model(landmarks)
+        _, predicted_idx = torch.max(output, 1)
+        letter = label_encoder.inverse_transform([predicted_idx.item()])[0]
+        # Move model back to CPU to free GPU memory
+        model = model.to('cpu')
+    return letter
+
+# Video processing function (CPU for video processing, GPU for prediction)
+def process_video(video_path):
+    # Open video file
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        return None, "Error: Could not open video."
+
+    # Variables to store output
+    text_output = ""
+    out_frames = []
+
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        # Process frame with MediaPipe (CPU)
+        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        results = hands.process(frame_rgb)
+
+        if results.multi_hand_landmarks:
+            for hand_landmarks in results.multi_hand_landmarks:
+                # Draw landmarks
+                mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
+
+                # Extract landmarks and predict (GPU via decorator)
+                landmarks = []
+                for lm in hand_landmarks.landmark:
+                    landmarks.extend([lm.x, lm.y, lm.z])
+                landmarks = np.array(landmarks, dtype=np.float32)
+                predicted_letter = predict_letter(landmarks, model, label_encoder)
+
+                # Add letter to text (avoid duplicates if same as last)
+                if not text_output or predicted_letter != text_output[-1]:
+                    text_output += predicted_letter
+
+                # Overlay predicted letter on frame
+                cv2.putText(frame, f"Letter: {predicted_letter}", (10, 30),
+                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
+
+        # Store processed frame
+        out_frames.append(frame)
+
+    cap.release()
+
+    # Write processed video to a temporary file
+    out_path = "processed_video.mp4"
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    out = cv2.VideoWriter(out_path, fourcc, 20.0, (out_frames[0].shape[1], out_frames[0].shape[0]))
+    for frame in out_frames:
+        out.write(frame)
+    out.release()
+
+    return out_path, text_output
+
+# Create Gradio interface with sample input
+with gr.Blocks(title="Sign Language Translation") as demo:
+    gr.Markdown("## Sign Language Translation")
+    video_input = gr.Video(label="Input Video", sources=["upload", "webcam"])
+    video_output = gr.Video(label="Processed Video with Landmarks")
+    text_output = gr.Textbox(label="Predicted Text", interactive=False)
+
+    # Button to process video
+    btn = gr.Button("Translate")
+    btn.click(
+        fn=process_video,
+        inputs=video_input,
+        outputs=[video_output, text_output]
+    )
+
+    # Add sample input video
+    gr.Examples(
+        examples=[["data/letters_seq.mp4"]],  # sample video added in this commit
+        inputs=[video_input],
+        outputs=[video_output, text_output],
+        fn=process_video,
+        cache_examples=True  # Cache the output for faster loading
+    )
+
+# Launch the app
+demo.launch()
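
As a quick sanity check, here is a minimal sketch of calling the classifier directly, assuming the `model` and `label_encoder` objects loaded in app.py above; the random vector is only a stand-in for real MediaPipe output (21 hand landmarks × x, y, z = 63 features), not actual data.

import numpy as np
import torch

# Stand-in for one detected hand: 21 landmarks * (x, y, z) = 63 features
dummy_landmarks = np.random.rand(63).astype(np.float32)

model.eval()  # BatchNorm layers use running stats, so a batch of one works
with torch.no_grad():
    logits = model(torch.tensor(dummy_landmarks).unsqueeze(0))  # shape (1, 28)
    idx = logits.argmax(dim=1).item()
print(label_encoder.inverse_transform([idx])[0])  # decoded class label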
data/asl_classifier.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e80bfae266cf8a67e10b4918814dab2701e0e5637f7cf8a7798b82917b143f6
+size 1291050
data/asl_landmarks_final.csv ADDED
The diff for this file is too large to render.
 
data/letters_seq.mp4 ADDED
Binary file (584 kB).
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
+torch
+pandas
+scikit-learn
+gradio
+opencv-python
+mediapipe
+numpy
+huggingface_hub