"""Live webcam depth-estimation demo: a DPT (SwinV2-tiny) model behind Gradio.

Each webcam frame is shrunk to a small square, passed through the depth
model, min-max normalised to 0..255 and rendered with the Inferno colormap.
"""

import cv2
import gradio as gr
import numpy as np
import torch
from transformers import DPTForDepthEstimation, DPTImageProcessor

MODEL_NAME = "Intel/dpt-swinv2-tiny-256"
# Side length, in pixels, of the square frame fed to the model and of the
# buffers below.
FRAME_SIZE = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision only pays off on the GPU; on CPU fall back to float32, where
# every operator is implemented.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

model = DPTForDepthEstimation.from_pretrained(
    MODEL_NAME, torch_dtype=model_dtype
).to(device)
# Loaded for completeness; the pipeline below does its own resize/scaling and
# does not call the processor.
processor = DPTImageProcessor.from_pretrained(MODEL_NAME)

# 256-entry Inferno lookup table. OpenCV produces BGR, whereas Gradio displays
# arrays as RGB, so the table is converted once here; applying a user LUT
# copies its entries verbatim, which makes every coloured frame RGB.
color_map = cv2.cvtColor(
    cv2.applyColorMap(np.arange(256, dtype=np.uint8), cv2.COLORMAP_INFERNO),
    cv2.COLOR_BGR2RGB,
)

# Buffers reused across frames to avoid per-frame allocations.
input_tensor = torch.zeros(
    (1, 3, FRAME_SIZE, FRAME_SIZE), dtype=model_dtype, device=device
)
depth_map = np.zeros((FRAME_SIZE, FRAME_SIZE), dtype=np.float32)
depth_map_colored = np.zeros((FRAME_SIZE, FRAME_SIZE, 3), dtype=np.uint8)


def preprocess_image(image):
    """Resize a frame to the model input size and scale it to [0, 1].

    Args:
        image: Array indexed as (height, width, channel); the transpose below
            requires exactly three axes.

    Returns:
        A float32 array laid out channel-first, with values divided by 255.
    """
    resized = cv2.resize(
        image, (FRAME_SIZE, FRAME_SIZE), interpolation=cv2.INTER_AREA
    )
    return resized.transpose(2, 0, 1).astype(np.float32) / 255.0


@torch.inference_mode()
def process_frame(image):
    """Turn one webcam frame into a colour-mapped depth image.

    Args:
        image: The frame to process, or ``None`` when no frame is available.

    Returns:
        The shared ``FRAME_SIZE`` x ``FRAME_SIZE`` x 3 uint8 colour buffer
        (overwritten on the next call), or ``None`` if ``image`` is ``None``.
    """
    if image is None:
        return None

    preprocessed = preprocess_image(image)
    # Assigning into the preallocated tensor casts to the model's dtype.
    input_tensor[0] = torch.from_numpy(preprocessed).to(device)

    predicted_depth = model(input_tensor).predicted_depth
    # `.cpu()` blocks until the result is ready, so no explicit CUDA
    # synchronisation is needed.
    depth = predicted_depth.squeeze().float().cpu().numpy()

    # Min-max normalise into the shared float buffer. `depth_map` is only ever
    # written through `out=`; rebinding the name here would make it local and
    # raise UnboundLocalError.
    np.subtract(depth, depth.min(), out=depth_map)
    peak = float(depth_map.max())
    if peak > 0.0:
        # A perfectly flat prediction has peak == 0; leave it all zeros
        # instead of dividing by zero.
        np.multiply(depth_map, 255.0 / peak, out=depth_map)

    depth_u8 = depth_map.astype(np.uint8)
    return cv2.applyColorMap(depth_u8, color_map, dst=depth_map_colored)


# NOTE(review): `source=` is the Gradio 3.x spelling, and `refresh_rate` does
# not appear among the documented `gr.Interface` parameters — confirm both
# against the installed Gradio version.
interface = gr.Interface(
    fn=process_frame,
    inputs=gr.Image(source="webcam", streaming=True),
    outputs="image",
    live=True,
    refresh_rate=0.1,
)
interface.launch()