Spaces:

qubvel-hf
/

vjepa2-streaming-video-classification

Running on L4

App Files Files Community

qubvel-hf HF Staff commited on Jun 16

Commit

d569c73

1 Parent(s): 9fae04f

initital

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +158 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: Vjepa2 Streaming Video Classification
 emoji: 🐠
 colorFrom: blue
 colorTo: indigo

 ---
+title: V-JEPA 2 - Streaming Video Classification
 emoji: 🐠
 colorFrom: blue
 colorTo: indigo

app.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import cv2
+import time
+import torch
+import gradio as gr
+import numpy as np
+from fastrtc import Stream, VideoStreamHandler, AdditionalOutputs
+from transformers import VJEPA2ForVideoClassification, AutoVideoProcessor
+CHECKPOINT = "qubvel-hf/vjepa2-vitl-fpc16-256-ssv2"
+TORCH_DTYPE = torch.float16
+TORCH_DEVICE = "cuda"
+UPDATE_EVERY_N_FRAMES = 64
+def add_text_on_image(image, text):
+    # Add a black background to the text
+    image[:70] = 0
+    line_spacing = 10
+    top_margin = 20
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    font_scale = 0.5
+    thickness = 1
+    color = (255, 255, 255)  # White
+    words = text.split()
+    lines = []
+    current_line = ""
+    img_width = image.shape[1]
+    # Build lines that fit within the image width
+    for word in words:
+        test_line = current_line + (" " if current_line else "") + word
+        (test_width, _), _ = cv2.getTextSize(test_line, font, font_scale, thickness)
+        if test_width > img_width - 20:  # 20 px margin
+            lines.append(current_line)
+            current_line = word
+        else:
+            current_line = test_line
+    if current_line:
+        lines.append(current_line)
+    # Draw each line, centered
+    y = top_margin
+    for line in lines:
+        (line_width, line_height), _ = cv2.getTextSize(line, font, font_scale, thickness)
+        x = (img_width - line_width) // 2
+        cv2.putText(image, line, (x, y + line_height), font, font_scale, color, thickness, cv2.LINE_AA)
+        y += line_height + line_spacing
+    return image
+class RunningFramesCache:
+    def __init__(self, save_every_k_frame: int = 1, max_frames: int = 16):
+        self.save_every_k_frame = save_every_k_frame
+        self.max_frames = max_frames
+        self._frames = []
+    def add_frame(self, frame: np.ndarray):
+        self._frames.append(frame)
+        if len(self._frames) > self.max_frames:
+            self._frames.pop(0)
+    def get_frames(self) -> list[np.ndarray]:
+        return self._frames
+    def __len__(self) -> int:
+        return len(self._frames)
+class RunningResult:
+    def __init__(self, max_predictions: int = 4):
+        self.predictions = []
+        self.max_predictions = max_predictions
+    def add_prediction(self, prediction: str):
+        # add time in a format of HH:MM:SS
+        current_time_formatted = time.strftime("%H:%M:%S", time.gmtime(time.time()))
+        self.predictions.append((current_time_formatted, prediction))
+        if len(self.predictions) > self.max_predictions:
+            self.predictions.pop(0)
+    def get_formatted_predictions(self) -> str:
+        if not self.predictions:
+            return "Starting..."
+        current, *past = self.predictions[::-1]
+        text = f">>> {current[1]}\n\n" + "\n".join([
+            f"[{time_formatted}] {prediction}"
+            for time_formatted, prediction in past
+        ])
+        return text
+    def get_last_prediction(self) -> str:
+        return self.predictions[-1][1] if self.predictions else "Starting..."
+class FrameProcessingCallback:
+    def __init__(self):
+        # Loading model and processor
+        self.model = VJEPA2ForVideoClassification.from_pretrained(
+            CHECKPOINT, torch_dtype=torch.bfloat16
+        ).to(TORCH_DEVICE)
+        self.video_processor = AutoVideoProcessor.from_pretrained(CHECKPOINT)
+        # Init frames cache
+        self.running_frames_cache = RunningFramesCache(
+            save_every_k_frame=128 / self.model.config.frames_per_clip,
+            max_frames=self.model.config.frames_per_clip,
+        )
+        self.running_result = RunningResult(max_predictions=4)
+        self.frame_count = 0
+    def __call__(self, image: np.ndarray):
+        image = np.flip(image, axis=1).copy()
+        self.running_frames_cache.add_frame(image)
+        self.frame_count += 1
+        print(f"Frame {self.frame_count}, n frames: {len(self.running_frames_cache)}")
+        if self.frame_count % UPDATE_EVERY_N_FRAMES == 0 and len(self.running_frames_cache) == self.model.config.frames_per_clip:
+            # Prepare frames for model
+            frames = self.running_frames_cache.get_frames()
+            frames = np.array(frames)
+            inputs = self.video_processor(frames, device=TORCH_DEVICE, return_tensors="pt").to(dtype=TORCH_DTYPE)
+            # Run model
+            with torch.no_grad():
+                logits = self.model(**inputs).logits
+            top_index = logits.argmax(dim=-1).item()
+            class_name = self.model.config.id2label[top_index]
+            self.running_result.add_prediction(class_name)
+        formatted_predictions = self.running_result.get_formatted_predictions()
+        last_prediction = self.running_result.get_last_prediction()
+        image = add_text_on_image(image, last_prediction)
+        return image, AdditionalOutputs(formatted_predictions)
+stream = Stream(
+    handler=VideoStreamHandler(FrameProcessingCallback(), skip_frames=True),
+    modality="video",
+    mode="send-receive",
+    additional_outputs=[gr.TextArea(label="Actions", value="", lines=5)],
+    additional_outputs_handler=lambda _, output: output,
+)
+if __name__ == "__main__":
+    stream.ui.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+transformers @ git+https://github.com/huggingface/transformers
+torch
+torchvision
+opencv-python-headless
+fastrtc>=0.0.28