update
- .gitmodules +3 -0
- app.py +153 -70
- demo.py +88 -21
- models/pipelines.py +327 -122
.gitmodules
CHANGED
@@ -1,3 +1,6 @@
 [submodule "submodules/MoGe"]
 	path = submodules/MoGe
 	url = https://github.com/microsoft/MoGe.git
+[submodule "submodules/vggt"]
+	path = submodules/vggt
+	url = https://github.com/facebookresearch/vggt.git
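The vggt submodule added above has to be present locally for the new imports in app.py and demo.py to resolve. A minimal sketch, assuming git is on PATH and the repository was cloned normally (this helper is not part of the commit):

# Hypothetical helper: fetch/refresh all submodules, including the newly added
# submodules/vggt, before launching the app. Equivalent to running
# `git submodule update --init --recursive` in the repository root.
import subprocess

subprocess.run(["git", "submodule", "update", "--init", "--recursive"], check=True)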
app.py
CHANGED
@@ -16,6 +16,7 @@ sys.path.append(project_root)
 
 try:
     sys.path.append(os.path.join(project_root, "submodules/MoGe"))
+    sys.path.append(os.path.join(project_root, "submodules/vggt"))
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 except:
     print("Warning: MoGe not found, motion transfer will not be applied")
@@ -27,6 +28,8 @@ hf_hub_download(repo_id="EXCAI/Diffusion-As-Shader", filename='spatracker/spaT_f
 
 from models.pipelines import DiffusionAsShaderPipeline, FirstFrameRepainter, CameraMotionGenerator, ObjectMotionGenerator
 from submodules.MoGe.moge.model import MoGeModel
+from submodules.vggt.vggt.utils.pose_enc import pose_encoding_to_extri_intri
+from submodules.vggt.vggt.models.vggt import VGGT
 
 # Parse command line arguments
 parser = argparse.ArgumentParser(description="Diffusion as Shader Web UI")
@@ -47,6 +50,7 @@ os.makedirs("outputs", exist_ok=True)
 # Create project tmp directory instead of using system temp
 os.makedirs(os.path.join(project_root, "tmp"), exist_ok=True)
 os.makedirs(os.path.join(project_root, "tmp", "gradio"), exist_ok=True)
+
 def load_media(media_path, max_frames=49, transform=None):
     """Load video or image frames and convert to tensor
 
@@ -69,22 +73,52 @@ def load_media(media_path, max_frames=49, transform=None):
     is_video = ext in ['.mp4', '.avi', '.mov']
 
     if is_video:
-
-
+        # Load video file info
+        video_clip = VideoFileClip(media_path)
+        duration = video_clip.duration
+        original_fps = video_clip.fps
+
+        # Case 1: Video longer than 6 seconds, sample first 6 seconds + 1 frame
+        if duration > 6.0:
+            sampling_fps = 8  # 8 frames per second
+            frames = load_video(media_path, sampling_fps=sampling_fps, max_frames=max_frames)
+            fps = sampling_fps
+        # Cases 2 and 3: Video shorter than 6 seconds
+        else:
+            # Load all frames
+            frames = load_video(media_path)
+
+            # Case 2: Total frames less than max_frames, need interpolation
+            if len(frames) < max_frames:
+                fps = len(frames) / duration  # Keep original fps
+
+                # Evenly interpolate to max_frames
+                indices = np.linspace(0, len(frames) - 1, max_frames)
+                new_frames = []
+                for i in indices:
+                    idx = int(i)
+                    new_frames.append(frames[idx])
+                frames = new_frames
+            # Case 3: Total frames more than max_frames but video less than 6 seconds
+            else:
+                # Evenly sample to max_frames
+                indices = np.linspace(0, len(frames) - 1, max_frames)
+                new_frames = []
+                for i in indices:
+                    idx = int(i)
+                    new_frames.append(frames[idx])
+                frames = new_frames
+                fps = max_frames / duration  # New fps to maintain duration
     else:
         # Handle image as single frame
         image = load_image(media_path)
         frames = [image]
         fps = 8  # Default fps for images
-
-
-    if len(frames) > max_frames:
-        frames = frames[:max_frames]
-    elif len(frames) < max_frames:
-        last_frame = frames[-1]
+
+    # Duplicate frame to max_frames
     while len(frames) < max_frames:
-        frames.append(
-
+        frames.append(frames[0].copy())
+
     # Convert frames to tensor
     video_tensor = torch.stack([transform(frame) for frame in frames])
 
@@ -131,6 +165,7 @@ def save_uploaded_file(file):
 
 das_pipeline = None
 moge_model = None
+vggt_model = None
 
 @spaces.GPU
 def get_das_pipeline():
@@ -147,6 +182,13 @@ def get_moge_model():
         moge_model = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(das.device)
     return moge_model
 
+@spaces.GPU
+def get_vggt_model():
+    global vggt_model
+    if vggt_model is None:
+        das = get_das_pipeline()
+        vggt_model = VGGT.from_pretrained("facebook/VGGT-1B").to(das.device)
+    return vggt_model
 
 def process_motion_transfer(source, prompt, mt_repaint_option, mt_repaint_image):
     """Process video motion transfer task"""
@@ -154,19 +196,20 @@ def process_motion_transfer(source, prompt, mt_repaint_option, mt_repaint_image)
         # Save uploaded files
         input_video_path = save_uploaded_file(source)
         if input_video_path is None:
-            return None
+            return None, None
 
         print(f"DEBUG: Repaint option: {mt_repaint_option}")
         print(f"DEBUG: Repaint image: {mt_repaint_image}")
 
-
         das = get_das_pipeline()
         video_tensor, fps, is_video = load_media(input_video_path)
+        das.fps = fps  # set das.fps to the fps returned by load_media
+
         if not is_video:
             tracking_method = "moge"
             print("Image input detected, using MoGe for tracking video generation.")
         else:
-            tracking_method = "
+            tracking_method = "cotracker"
 
         repaint_img_tensor = None
         if mt_repaint_image is not None:
@@ -180,7 +223,9 @@ def process_motion_transfer(source, prompt, mt_repaint_option, mt_repaint_image)
                 prompt=prompt,
                 depth_path=None
             )
+
         tracking_tensor = None
+        tracking_path = None
         if tracking_method == "moge":
             moge = get_moge_model()
             infer_result = moge.infer(video_tensor[0].to(das.device)) # [C, H, W] in range [0,1]
@@ -195,32 +240,31 @@ def process_motion_transfer(source, prompt, mt_repaint_option, mt_repaint_image)
 
             pred_tracks = cam_motion.w2s(pred_tracks_flatten, poses).reshape([video_tensor.shape[0], H, W, 3]) # [T, H, W, 3]
 
-
+            tracking_path, tracking_tensor = das.visualize_tracking_moge(
                 pred_tracks.cpu().numpy(),
                 infer_result["mask"].cpu().numpy()
             )
             print('Export tracking video via MoGe')
         else:
-
-
-
-            print('Export tracking video via 
+            # use cotracker
+            pred_tracks, pred_visibility = das.generate_tracking_cotracker(video_tensor)
+            tracking_path, tracking_tensor = das.visualize_tracking_cotracker(pred_tracks, pred_visibility)
+            print('Export tracking video via cotracker')
 
         output_path = das.apply_tracking(
             video_tensor=video_tensor,
-            fps=
+            fps=fps,  # use the fps returned by load_media
             tracking_tensor=tracking_tensor,
             img_cond_tensor=repaint_img_tensor,
            prompt=prompt,
            checkpoint_path=DEFAULT_MODEL_PATH
        )
 
-        return output_path
+        return tracking_path, output_path
    except Exception as e:
        import traceback
        print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
-        return None
-
+        return None, None
 
 def process_camera_control(source, prompt, camera_motion, tracking_method):
     """Process camera control task"""
@@ -228,17 +272,18 @@ def process_camera_control(source, prompt, camera_motion, tracking_method):
         # Save uploaded files
         input_media_path = save_uploaded_file(source)
         if input_media_path is None:
-            return None
+            return None, None
 
         print(f"DEBUG: Camera motion: '{camera_motion}'")
         print(f"DEBUG: Tracking method: '{tracking_method}'")
 
         das = get_das_pipeline()
-
         video_tensor, fps, is_video = load_media(input_media_path)
-
+        das.fps = fps  # set das.fps to the fps returned by load_media
+
+        if not is_video:
             tracking_method = "moge"
-            print("Image input detected
+            print("Image input detected, switching to MoGe")
 
         cam_motion = CameraMotionGenerator(camera_motion)
         repaint_img_tensor = None
@@ -267,32 +312,54 @@ def process_camera_control(source, prompt, camera_motion, tracking_method):
             )
             print('Export tracking video via MoGe')
         else:
-
-            pred_tracks, pred_visibility 
+            # use cotracker
+            pred_tracks, pred_visibility = das.generate_tracking_cotracker(video_tensor)
+
+            t, c, h, w = video_tensor.shape
+            new_width = 518
+            new_height = round(h * (new_width / w) / 14) * 14
+            resize_transform = transforms.Resize((new_height, new_width), interpolation=Image.BICUBIC)
+            video_vggt = resize_transform(video_tensor) # [T, C, H, W]
+
+            if new_height > 518:
+                start_y = (new_height - 518) // 2
+                video_vggt = video_vggt[:, :, start_y:start_y + 518, :]
+
+            vggt_model = get_vggt_model()
+
+            with torch.no_grad():
+                with torch.cuda.amp.autocast(dtype=das.dtype):
+                    video_vggt = video_vggt.unsqueeze(0) # [1, T, C, H, W]
+                    aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_vggt.to(das.device))
+
+                    extr, intr = pose_encoding_to_extri_intri(vggt_model.camera_head(aggregated_tokens_list)[-1], video_vggt.shape[-2:])
+
+            cam_motion.set_intr(intr)
+            cam_motion.set_extr(extr)
+
             if camera_motion:
                 poses = cam_motion.get_default_motion() # shape: [49, 4, 4]
-
+                pred_tracks_world = cam_motion.s2w_vggt(pred_tracks, extr, intr)
+                pred_tracks = cam_motion.w2s_vggt(pred_tracks_world, extr, intr, poses) # [T, N, 3]
                 print("Camera motion applied")
-
-
-            print('Export tracking video via 
-
+
+            tracking_path, tracking_tensor = das.visualize_tracking_cotracker(pred_tracks, None)
+            print('Export tracking video via cotracker')
 
         output_path = das.apply_tracking(
             video_tensor=video_tensor,
-            fps=
+            fps=fps,  # use the fps returned by load_media
             tracking_tensor=tracking_tensor,
             img_cond_tensor=repaint_img_tensor,
             prompt=prompt,
             checkpoint_path=DEFAULT_MODEL_PATH
         )
 
-        return output_path
+        return tracking_path, output_path
     except Exception as e:
         import traceback
         print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
-        return None
-
+        return None, None
 
 def process_object_manipulation(source, prompt, object_motion, object_mask, tracking_method):
     """Process object manipulation task"""
@@ -300,21 +367,21 @@ def process_object_manipulation(source, prompt, object_motion, object_mask, trac
         # Save uploaded files
         input_image_path = save_uploaded_file(source)
         if input_image_path is None:
-            return None
+            return None, None
 
         object_mask_path = save_uploaded_file(object_mask)
         if object_mask_path is None:
             print("Object mask not provided")
-            return None
-
+            return None, None
 
         das = get_das_pipeline()
         video_tensor, fps, is_video = load_media(input_image_path)
-
+        das.fps = fps  # set das.fps to the fps returned by load_media
+
+        if not is_video:
             tracking_method = "moge"
-            print("Image input detected
+            print("Image input detected, switching to MoGe")
 
-
         mask_image = Image.open(object_mask_path).convert('L')
         mask_image = transforms.Resize((480, 720))(mask_image)
         mask = torch.from_numpy(np.array(mask_image) > 127)
@@ -322,10 +389,10 @@ def process_object_manipulation(source, prompt, object_motion, object_mask, trac
         motion_generator = ObjectMotionGenerator(device=das.device)
         repaint_img_tensor = None
         tracking_tensor = None
+
         if tracking_method == "moge":
             moge = get_moge_model()
 
-
             infer_result = moge.infer(video_tensor[0].to(das.device)) # [C, H, W] in range [0,1]
             H, W = infer_result["points"].shape[0:2]
             pred_tracks = infer_result["points"].unsqueeze(0).repeat(49, 1, 1, 1) #[T, H, W, 3]
@@ -342,7 +409,6 @@ def process_object_manipulation(source, prompt, object_motion, object_mask, trac
             poses = torch.eye(4).unsqueeze(0).repeat(49, 1, 1)
             pred_tracks_flatten = pred_tracks.reshape(video_tensor.shape[0], H*W, 3)
 
-
             cam_motion = CameraMotionGenerator(None)
             cam_motion.set_intr(infer_result["intrinsics"])
             pred_tracks = cam_motion.w2s(pred_tracks_flatten, poses).reshape([video_tensor.shape[0], H, W, 3]) # [T, H, W, 3]
@@ -353,9 +419,27 @@ def process_object_manipulation(source, prompt, object_motion, object_mask, trac
             )
             print('Export tracking video via MoGe')
         else:
+            # use cotracker
+            pred_tracks, pred_visibility = das.generate_tracking_cotracker(video_tensor)
 
-
+            t, c, h, w = video_tensor.shape
+            new_width = 518
+            new_height = round(h * (new_width / w) / 14) * 14
+            resize_transform = transforms.Resize((new_height, new_width), interpolation=Image.BICUBIC)
+            video_vggt = resize_transform(video_tensor) # [T, C, H, W]
 
+            if new_height > 518:
+                start_y = (new_height - 518) // 2
+                video_vggt = video_vggt[:, :, start_y:start_y + 518, :]
+
+            vggt_model = get_vggt_model()
+
+            with torch.no_grad():
+                with torch.cuda.amp.autocast(dtype=das.dtype):
+                    video_vggt = video_vggt.unsqueeze(0) # [1, T, C, H, W]
+                    aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_vggt.to(das.device))
+
+                    extr, intr = pose_encoding_to_extri_intri(vggt_model.camera_head(aggregated_tokens_list)[-1], video_vggt.shape[-2:])
 
         pred_tracks = motion_generator.apply_motion(
             pred_tracks=pred_tracks.squeeze(),
@@ -363,30 +447,27 @@ def process_object_manipulation(source, prompt, object_motion, object_mask, trac
             motion_type=object_motion,
             distance=50,
             num_frames=49,
-            tracking_method="
-        )
+            tracking_method="cotracker"
+        )
         print(f"Object motion '{object_motion}' applied using provided mask")
 
-
-
-        print('Export tracking video via SpaTracker')
-
+        tracking_path, tracking_tensor = das.visualize_tracking_cotracker(pred_tracks.unsqueeze(0), None)
+        print('Export tracking video via cotracker')
 
         output_path = das.apply_tracking(
             video_tensor=video_tensor,
-            fps=
+            fps=fps,  # use the fps returned by load_media
             tracking_tensor=tracking_tensor,
             img_cond_tensor=repaint_img_tensor,
             prompt=prompt,
             checkpoint_path=DEFAULT_MODEL_PATH
         )
 
-        return output_path
+        return tracking_path, output_path
     except Exception as e:
         import traceback
         print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
-        return None
-
+        return None, None
 
 def process_mesh_animation(source, prompt, tracking_video, ma_repaint_option, ma_repaint_image):
     """Process mesh animation task"""
@@ -394,15 +475,16 @@ def process_mesh_animation(source, prompt, tracking_video, ma_repaint_option, ma
         # Save uploaded files
         input_video_path = save_uploaded_file(source)
         if input_video_path is None:
-            return None
+            return None, None
 
         tracking_video_path = save_uploaded_file(tracking_video)
         if tracking_video_path is None:
-            return None
-
+            return None, None
 
         das = get_das_pipeline()
         video_tensor, fps, is_video = load_media(input_video_path)
+        das.fps = fps  # set das.fps to the fps returned by load_media
+
         tracking_tensor, tracking_fps, _ = load_media(tracking_video_path)
         repaint_img_tensor = None
         if ma_repaint_image is not None:
@@ -420,18 +502,18 @@ def process_mesh_animation(source, prompt, tracking_video, ma_repaint_option, ma
 
         output_path = das.apply_tracking(
             video_tensor=video_tensor,
-            fps=
+            fps=fps,  # use the fps returned by load_media
             tracking_tensor=tracking_tensor,
             img_cond_tensor=repaint_img_tensor,
             prompt=prompt,
             checkpoint_path=DEFAULT_MODEL_PATH
         )
 
-        return output_path
+        return tracking_video_path, output_path
     except Exception as e:
         import traceback
         print(f"Processing failed: {str(e)}\n{traceback.format_exc()}")
-        return None
+        return None, None
 
 # Create Gradio interface with updated layout
 with gr.Blocks(title="Diffusion as Shader") as demo:
@@ -444,6 +526,7 @@ with gr.Blocks(title="Diffusion as Shader") as demo:
 
     with right_column:
         output_video = gr.Video(label="Generated Video")
+        tracking_video = gr.Video(label="Tracking Video")
 
     with left_column:
         source = gr.File(label="Source", file_types=["image", "video"])
@@ -479,7 +562,7 @@ with gr.Blocks(title="Diffusion as Shader") as demo:
                     source, common_prompt,
                     mt_repaint_option, mt_repaint_image
                 ],
-                outputs=[output_video]
+                outputs=[tracking_video, output_video]
             )
 
         # Camera Control tab
@@ -597,8 +680,8 @@ with gr.Blocks(title="Diffusion as Shader") as demo:
 
             cc_tracking_method = gr.Radio(
                 label="Tracking Method",
-                choices=["
-                value="
+                choices=["moge", "cotracker"],
+                value="cotracker"
             )
 
            # Add run button for Camera Control tab
@@ -611,7 +694,7 @@ with gr.Blocks(title="Diffusion as Shader") as demo:
                    source, common_prompt,
                    cc_camera_motion, cc_tracking_method
                ],
-                outputs=[output_video]
+                outputs=[tracking_video, output_video]
            )
 
        # Object Manipulation tab
@@ -629,8 +712,8 @@ with gr.Blocks(title="Diffusion as Shader") as demo:
            )
            om_tracking_method = gr.Radio(
                label="Tracking Method",
-                choices=["
-                value="
+                choices=["moge", "cotracker"],
+                value="cotracker"
            )
 
            # Add run button for Object Manipulation tab
@@ -643,7 +726,7 @@ with gr.Blocks(title="Diffusion as Shader") as demo:
                    source, common_prompt,
                    om_object_motion, om_object_mask, om_tracking_method
                ],
-                outputs=[output_video]
+                outputs=[tracking_video, output_video]
            )
 
        # Animating meshes to video tab
@@ -683,7 +766,7 @@ with gr.Blocks(title="Diffusion as Shader") as demo:
                    source, common_prompt,
                    ma_tracking_video, ma_repaint_option, ma_repaint_image
                ],
-                outputs=[output_video]
+                outputs=[tracking_video, output_video]
            )
 
    # Launch interface
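For reference, a minimal sketch of the frame-sampling policy the reworked load_media() implements, isolated from file I/O so the three cases can be checked directly. plan_sampling, duration and n_frames are illustrative names, not part of the commit:

import numpy as np

def plan_sampling(duration, n_frames, max_frames=49):
    """Return (frame_indices, fps) following the three cases in load_media()."""
    if duration > 6.0:
        # Case 1: long clip -> let the loader resample at a fixed 8 fps
        return None, 8
    if n_frames < max_frames:
        fps = n_frames / duration            # Case 2: keep the original fps
    else:
        fps = max_frames / duration          # Case 3: new fps preserves duration
    # Both short-clip cases index evenly into the decoded frames
    indices = np.linspace(0, n_frames - 1, max_frames).astype(int)
    return indices, fps

print(plan_sampling(3.0, 30))    # Case 2: indices repeat frames, fps = 10.0
print(plan_sampling(4.0, 100))   # Case 3: even subsampling, fps = 12.25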
demo.py
CHANGED
@@ -5,6 +5,7 @@ from PIL import Image
 project_root = os.path.dirname(os.path.abspath(__file__))
 try:
     sys.path.append(os.path.join(project_root, "submodules/MoGe"))
+    sys.path.append(os.path.join(project_root, "submodules/vggt"))
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 except:
     print("Warning: MoGe not found, motion transfer will not be applied")
@@ -18,6 +19,8 @@ from diffusers.utils import load_image, load_video
 
 from models.pipelines import DiffusionAsShaderPipeline, FirstFrameRepainter, CameraMotionGenerator, ObjectMotionGenerator
 from submodules.MoGe.moge.model import MoGeModel
+from submodules.vggt.vggt.utils.pose_enc import pose_encoding_to_extri_intri
+from submodules.vggt.vggt.models.vggt import VGGT
 
 def load_media(media_path, max_frames=49, transform=None):
     """Load video or image frames and convert to tensor
@@ -28,7 +31,7 @@ def load_media(media_path, max_frames=49, transform=None):
         transform (callable): Transform to apply to frames
 
     Returns:
-        Tuple[torch.Tensor, float]: Video tensor [T,C,H,W] and 
+        Tuple[torch.Tensor, float, bool]: Video tensor [T,C,H,W], FPS, and is_video flag
     """
     if transform is None:
         transform = transforms.Compose([
@@ -41,22 +44,52 @@ def load_media(media_path, max_frames=49, transform=None):
     is_video = ext in ['.mp4', '.avi', '.mov']
 
     if is_video:
-
-
+        # Load video file info
+        video_clip = VideoFileClip(media_path)
+        duration = video_clip.duration
+        original_fps = video_clip.fps
+
+        # Case 1: Video longer than 6 seconds, sample first 6 seconds + 1 frame
+        if duration > 6.0:
+            sampling_fps = 8  # 8 frames per second
+            frames = load_video(media_path, sampling_fps=sampling_fps, max_frames=max_frames)
+            fps = sampling_fps
+        # Cases 2 and 3: Video shorter than 6 seconds
+        else:
+            # Load all frames
+            frames = load_video(media_path)
+
+            # Case 2: Total frames less than max_frames, need interpolation
+            if len(frames) < max_frames:
+                fps = len(frames) / duration  # Keep original fps
+
+                # Evenly interpolate to max_frames
+                indices = np.linspace(0, len(frames) - 1, max_frames)
+                new_frames = []
+                for i in indices:
+                    idx = int(i)
+                    new_frames.append(frames[idx])
+                frames = new_frames
+            # Case 3: Total frames more than max_frames but video less than 6 seconds
+            else:
+                # Evenly sample to max_frames
+                indices = np.linspace(0, len(frames) - 1, max_frames)
+                new_frames = []
+                for i in indices:
+                    idx = int(i)
+                    new_frames.append(frames[idx])
+                frames = new_frames
+                fps = max_frames / duration  # New fps to maintain duration
     else:
         # Handle image as single frame
         image = load_image(media_path)
         frames = [image]
         fps = 8  # Default fps for images
-
-
-    if len(frames) > max_frames:
-        frames = frames[:max_frames]
-    elif len(frames) < max_frames:
-        last_frame = frames[-1]
+
+    # Duplicate frame to max_frames
     while len(frames) < max_frames:
-        frames.append(
-
+        frames.append(frames[0].copy())
+
     # Convert frames to tensor
     video_tensor = torch.stack([transform(frame) for frame in frames])
 
@@ -77,8 +110,8 @@ if __name__ == "__main__":
                         help='Camera motion mode: "trans <dx> <dy> <dz>" or "rot <axis> <angle>" or "spiral <radius>"')
     parser.add_argument('--object_motion', type=str, default=None, help='Object motion mode: up/down/left/right')
     parser.add_argument('--object_mask', type=str, default=None, help='Path to object mask image (binary image)')
-    parser.add_argument('--tracking_method', type=str, default='spatracker', choices=['spatracker', 'moge'],
-                        help='Tracking method to use (spatracker or moge)')
+    parser.add_argument('--tracking_method', type=str, default='spatracker', choices=['spatracker', 'moge', 'cotracker'],
+                        help='Tracking method to use (spatracker, cotracker or moge)')
     args = parser.parse_args()
 
     # Load input video/image
@@ -89,6 +122,7 @@ if __name__ == "__main__":
 
     # Initialize pipeline
     das = DiffusionAsShaderPipeline(gpu_id=args.gpu, output_dir=args.output_dir)
+    das.fps = fps
     if args.tracking_method == "moge" and args.tracking_path is None:
         moge = MoGeModel.from_pretrained("Ruicheng/moge-vitl").to(das.device)
 
@@ -153,7 +187,7 @@ if __name__ == "__main__":
             poses = torch.eye(4).unsqueeze(0).repeat(49, 1, 1)
             # change pred_tracks into screen coordinate
             pred_tracks_flatten = pred_tracks.reshape(video_tensor.shape[0], H*W, 3)
-            pred_tracks = cam_motion.
+            pred_tracks = cam_motion.w2s_moge(pred_tracks_flatten, poses).reshape([video_tensor.shape[0], H, W, 3]) # [T, H, W, 3]
             _, tracking_tensor = das.visualize_tracking_moge(
                 pred_tracks.cpu().numpy(),
                 infer_result["mask"].cpu().numpy()
@@ -161,13 +195,44 @@ if __name__ == "__main__":
             print('export tracking video via MoGe.')
 
         else:
-
-
+
+            if args.tracking_method == "cotracker":
+                pred_tracks, pred_visibility = das.generate_tracking_cotracker(video_tensor) # T N 3, T N
+            else:
+                pred_tracks, pred_visibility, T_Firsts = das.generate_tracking_spatracker(video_tensor) # T N 3, T N, B N
+
+            # Preprocess video tensor to match VGGT requirements
+            t, c, h, w = video_tensor.shape
+            new_width = 518
+            new_height = round(h * (new_width / w) / 14) * 14
+            resize_transform = transforms.Resize((new_height, new_width), interpolation=Image.BICUBIC)
+            video_vggt = resize_transform(video_tensor) # [T, C, H, W]
+
+            if new_height > 518:
+                start_y = (new_height - 518) // 2
+                video_vggt = video_vggt[:, :, start_y:start_y + 518, :]
+
+            # Get extrinsic and intrinsic matrices
+            vggt_model = VGGT.from_pretrained("facebook/VGGT-1B").to(das.device)
+
+            with torch.no_grad():
+                with torch.cuda.amp.autocast(dtype=das.dtype):
+
+                    video_vggt = video_vggt.unsqueeze(0) # [1, T, C, H, W]
+                    aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_vggt.to(das.device))
+
+                    # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
+                    extr, intr = pose_encoding_to_extri_intri(vggt_model.camera_head(aggregated_tokens_list)[-1], video_vggt.shape[-2:])
+                    depth_map, depth_conf = vggt_model.depth_head(aggregated_tokens_list, video_vggt, ps_idx)
+
+            cam_motion.set_intr(intr)
+            cam_motion.set_extr(extr)
 
             # Apply camera motion if specified
             if args.camera_motion:
                 poses = cam_motion.get_default_motion() # shape: [49, 4, 4]
-
+                pred_tracks_world = cam_motion.s2w_vggt(pred_tracks, extr, intr)
+                pred_tracks = cam_motion.w2s_vggt(pred_tracks_world, extr, intr, poses) # [T, N, 3]
                 print("Camera motion applied")
 
     # Apply object motion if specified
@@ -184,7 +249,7 @@ if __name__ == "__main__":
         motion_generator = ObjectMotionGenerator(device=das.device)
 
         pred_tracks = motion_generator.apply_motion(
-            pred_tracks=pred_tracks
+            pred_tracks=pred_tracks,
             mask=mask,
             motion_type=args.object_motion,
             distance=50,
@@ -193,12 +258,14 @@ if __name__ == "__main__":
         ).unsqueeze(0)
         print(f"Object motion '{args.object_motion}' applied using mask from {args.object_mask}")
 
-
-
+        if args.tracking_method == "cotracker":
+            _, tracking_tensor = das.visualize_tracking_cotracker(pred_tracks, pred_visibility)
+        else:
+            _, tracking_tensor = das.visualize_tracking_spatracker(video_tensor, pred_tracks, pred_visibility, T_Firsts)
 
     das.apply_tracking(
         video_tensor=video_tensor,
-        fps=
+        fps=fps,
         tracking_tensor=tracking_tensor,
         img_cond_tensor=repaint_img_tensor,
         prompt=args.prompt,
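Both app.py and demo.py resize the clip before handing it to VGGT. A shape-only sketch of that preprocessing, with no model loaded; torchvision's InterpolationMode is used here instead of the PIL constant the diff passes, and the tensor size is illustrative:

import torch
from torchvision import transforms

video_tensor = torch.rand(49, 3, 480, 720)   # [T, C, H, W] stand-in for the loaded clip
t, c, h, w = video_tensor.shape
new_width = 518
new_height = round(h * (new_width / w) / 14) * 14   # keep height a multiple of the 14-px patch size
resize = transforms.Resize((new_height, new_width),
                           interpolation=transforms.InterpolationMode.BICUBIC)
video_vggt = resize(video_tensor)
if new_height > 518:                          # center-crop overly tall frames back to 518
    start_y = (new_height - 518) // 2
    video_vggt = video_vggt[:, :, start_y:start_y + 518, :]
print(video_vggt.shape)                       # torch.Size([49, 3, 350, 518]) for 480x720 input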
models/pipelines.py
CHANGED
@@ -22,9 +22,9 @@ from models.spatracker.utils.visualizer import Visualizer
|
|
22 |
from models.cogvideox_tracking import CogVideoXImageToVideoPipelineTracking
|
23 |
|
24 |
from submodules.MoGe.moge.model import MoGeModel
|
|
|
25 |
from image_gen_aux import DepthPreprocessor
|
26 |
from moviepy.editor import ImageSequenceClip
|
27 |
-
import spaces
|
28 |
|
29 |
class DiffusionAsShaderPipeline:
|
30 |
def __init__(self, gpu_id=0, output_dir='outputs'):
|
@@ -45,6 +45,7 @@ class DiffusionAsShaderPipeline:
|
|
45 |
# device
|
46 |
self.device = f"cuda:{gpu_id}"
|
47 |
torch.cuda.set_device(gpu_id)
|
|
|
48 |
|
49 |
# files
|
50 |
self.output_dir = output_dir
|
@@ -56,7 +57,6 @@ class DiffusionAsShaderPipeline:
|
|
56 |
transforms.ToTensor()
|
57 |
])
|
58 |
|
59 |
-
@spaces.GPU(duration=240)
|
60 |
@torch.no_grad()
|
61 |
def _infer(
|
62 |
self,
|
@@ -65,7 +65,7 @@ class DiffusionAsShaderPipeline:
|
|
65 |
tracking_tensor: torch.Tensor = None,
|
66 |
image_tensor: torch.Tensor = None, # [C,H,W] in range [0,1]
|
67 |
output_path: str = "./output.mp4",
|
68 |
-
num_inference_steps: int =
|
69 |
guidance_scale: float = 6.0,
|
70 |
num_videos_per_prompt: int = 1,
|
71 |
dtype: torch.dtype = torch.bfloat16,
|
@@ -114,6 +114,8 @@ class DiffusionAsShaderPipeline:
|
|
114 |
pipe.text_encoder.eval()
|
115 |
pipe.vae.eval()
|
116 |
|
|
|
|
|
117 |
# Process tracking tensor
|
118 |
tracking_maps = tracking_tensor.float() # [T, C, H, W]
|
119 |
tracking_maps = tracking_maps.to(device=self.device, dtype=dtype)
|
@@ -167,60 +169,9 @@ class DiffusionAsShaderPipeline:
|
|
167 |
|
168 |
def _set_camera_motion(self, camera_motion):
|
169 |
self.camera_motion = camera_motion
|
170 |
-
|
171 |
-
def _get_intr(self, fov, H=480, W=720):
|
172 |
-
fov_rad = math.radians(fov)
|
173 |
-
focal_length = (W / 2) / math.tan(fov_rad / 2)
|
174 |
-
|
175 |
-
cx = W / 2
|
176 |
-
cy = H / 2
|
177 |
-
|
178 |
-
intr = torch.tensor([
|
179 |
-
[focal_length, 0, cx],
|
180 |
-
[0, focal_length, cy],
|
181 |
-
[0, 0, 1]
|
182 |
-
], dtype=torch.float32)
|
183 |
-
|
184 |
-
return intr
|
185 |
-
|
186 |
-
@spaces.GPU
|
187 |
-
def _apply_poses(self, pts, intr, poses):
|
188 |
-
"""
|
189 |
-
Args:
|
190 |
-
pts (torch.Tensor): pointclouds coordinates [T, N, 3]
|
191 |
-
intr (torch.Tensor): camera intrinsics [T, 3, 3]
|
192 |
-
poses (numpy.ndarray): camera poses [T, 4, 4]
|
193 |
-
"""
|
194 |
-
poses = torch.from_numpy(poses).float().to(self.device)
|
195 |
-
|
196 |
-
T, N, _ = pts.shape
|
197 |
-
ones = torch.ones(T, N, 1, device=self.device, dtype=torch.float)
|
198 |
-
pts_hom = torch.cat([pts[:, :, :2], ones], dim=-1) # (T, N, 3)
|
199 |
-
pts_cam = torch.bmm(pts_hom, torch.linalg.inv(intr).transpose(1, 2)) # (T, N, 3)
|
200 |
-
pts_cam[:,:, :3] /= pts[:, :, 2:3]
|
201 |
-
|
202 |
-
# to homogeneous
|
203 |
-
pts_cam = torch.cat([pts_cam, ones], dim=-1) # (T, N, 4)
|
204 |
-
|
205 |
-
if poses.shape[0] == 1:
|
206 |
-
poses = poses.repeat(T, 1, 1)
|
207 |
-
elif poses.shape[0] != T:
|
208 |
-
raise ValueError(f"Poses length ({poses.shape[0]}) must match sequence length ({T})")
|
209 |
-
|
210 |
-
pts_world = torch.bmm(pts_cam, poses.transpose(1, 2))[:, :, :3] # (T, N, 3)
|
211 |
-
|
212 |
-
pts_proj = torch.bmm(pts_world, intr.transpose(1, 2)) # (T, N, 3)
|
213 |
-
pts_proj[:, :, :2] /= pts_proj[:, :, 2:3]
|
214 |
-
|
215 |
-
return pts_proj
|
216 |
-
|
217 |
-
def apply_traj_on_tracking(self, pred_tracks, camera_motion=None, fov=55, frame_num=49):
|
218 |
-
intr = self._get_intr(fov).unsqueeze(0).repeat(frame_num, 1, 1).to(self.device)
|
219 |
-
tracking_pts = self._apply_poses(pred_tracks.squeeze(), intr, camera_motion).unsqueeze(0)
|
220 |
-
return tracking_pts
|
221 |
|
222 |
##============= SpatialTracker =============##
|
223 |
-
|
224 |
def generate_tracking_spatracker(self, video_tensor, density=70):
|
225 |
"""Generate tracking video
|
226 |
|
@@ -233,7 +184,7 @@ class DiffusionAsShaderPipeline:
|
|
233 |
print("Loading tracking models...")
|
234 |
# Load tracking model
|
235 |
tracker = SpaTrackerPredictor(
|
236 |
-
checkpoint=os.path.join(project_root, 'checkpoints/
|
237 |
interp_shape=(384, 576),
|
238 |
seq_length=12
|
239 |
).to(self.device)
|
@@ -268,14 +219,13 @@ class DiffusionAsShaderPipeline:
|
|
268 |
progressive_tracking=False
|
269 |
)
|
270 |
|
271 |
-
return pred_tracks, pred_visibility, T_Firsts
|
272 |
|
273 |
finally:
|
274 |
# Clean up GPU memory
|
275 |
del tracker, self.depth_preprocessor
|
276 |
torch.cuda.empty_cache()
|
277 |
|
278 |
-
@spaces.GPU
|
279 |
def visualize_tracking_spatracker(self, video, pred_tracks, pred_visibility, T_Firsts, save_tracking=True):
|
280 |
video = video.unsqueeze(0).to(self.device)
|
281 |
vis = Visualizer(save_dir=self.output_dir, grayscale=False, fps=24, pad_value=0)
|
@@ -365,7 +315,6 @@ class DiffusionAsShaderPipeline:
|
|
365 |
outline=tuple(color),
|
366 |
)
|
367 |
|
368 |
-
@spaces.GPU
|
369 |
def visualize_tracking_moge(self, points, mask, save_tracking=True):
|
370 |
"""Visualize tracking results from MoGe model
|
371 |
|
@@ -399,8 +348,6 @@ class DiffusionAsShaderPipeline:
|
|
399 |
normalized_z = np.clip((inv_z - p2) / (p98 - p2), 0, 1)
|
400 |
colors[:, :, 2] = (normalized_z * 255).astype(np.uint8)
|
401 |
colors = colors.astype(np.uint8)
|
402 |
-
# colors = colors * mask[..., None]
|
403 |
-
# points = points * mask[None, :, :, None]
|
404 |
|
405 |
points = points.reshape(T, -1, 3)
|
406 |
colors = colors.reshape(-1, 3)
|
@@ -408,7 +355,7 @@ class DiffusionAsShaderPipeline:
|
|
408 |
# Initialize list to store frames
|
409 |
frames = []
|
410 |
|
411 |
-
for i, pts_i in enumerate(tqdm(points)):
|
412 |
pixels, depths = pts_i[..., :2], pts_i[..., 2]
|
413 |
pixels[..., 0] = pixels[..., 0] * W
|
414 |
pixels[..., 1] = pixels[..., 1] * H
|
@@ -451,8 +398,178 @@ class DiffusionAsShaderPipeline:
|
|
451 |
tracking_path = None
|
452 |
|
453 |
return tracking_path, tracking_video
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
|
455 |
-
@spaces.GPU(duration=240)
|
456 |
def apply_tracking(self, video_tensor, fps=8, tracking_tensor=None, img_cond_tensor=None, prompt=None, checkpoint_path=None):
|
457 |
"""Generate final video with motion transfer
|
458 |
|
@@ -478,7 +595,7 @@ class DiffusionAsShaderPipeline:
|
|
478 |
tracking_tensor=tracking_tensor,
|
479 |
image_tensor=img_cond_tensor,
|
480 |
output_path=final_output,
|
481 |
-
num_inference_steps=
|
482 |
guidance_scale=6.0,
|
483 |
dtype=torch.bfloat16,
|
484 |
fps=self.fps
|
@@ -493,7 +610,6 @@ class DiffusionAsShaderPipeline:
|
|
493 |
"""
|
494 |
self.object_motion = motion_type
|
495 |
|
496 |
-
@spaces.GPU(duration=120)
|
497 |
class FirstFrameRepainter:
|
498 |
def __init__(self, gpu_id=0, output_dir='outputs'):
|
499 |
"""Initialize FirstFrameRepainter
|
@@ -506,8 +622,7 @@ class FirstFrameRepainter:
|
|
506 |
self.output_dir = output_dir
|
507 |
self.max_depth = 65.0
|
508 |
os.makedirs(output_dir, exist_ok=True)
|
509 |
-
|
510 |
-
@spaces.GPU(duration=120)
|
511 |
def repaint(self, image_tensor, prompt, depth_path=None, method="dav"):
|
512 |
"""Repaint first frame using Flux
|
513 |
|
@@ -599,48 +714,158 @@ class CameraMotionGenerator:
|
|
599 |
fx = fy = (W / 2) / math.tan(fov_rad / 2)
|
600 |
|
601 |
self.intr[0, 0] = fx
|
602 |
-
self.intr[1, 1] = fy
|
|
|
|
|
603 |
|
604 |
-
def
|
605 |
"""
|
|
|
|
|
606 |
Args:
|
607 |
-
|
608 |
-
|
609 |
-
|
|
|
|
|
|
|
610 |
"""
|
611 |
-
if isinstance(
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
|
|
|
|
|
|
|
|
|
|
|
623 |
|
624 |
-
|
625 |
-
|
626 |
-
|
627 |
-
raise ValueError(f"Poses length ({poses.shape[0]}) must match sequence length ({T})")
|
628 |
|
629 |
-
|
630 |
-
|
631 |
-
|
632 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
633 |
|
634 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
635 |
|
636 |
-
def
|
637 |
if isinstance(poses, np.ndarray):
|
638 |
poses = torch.from_numpy(poses)
|
639 |
assert poses.shape[0] == self.frame_num
|
640 |
poses = poses.to(torch.float32).to(self.device)
|
641 |
T, N, _ = pts.shape # (T, N, 3)
|
642 |
intr = self.intr.unsqueeze(0).repeat(self.frame_num, 1, 1)
|
643 |
-
# Step 1: 扩展点的维度,使其变成 (T, N, 4),最后一维填充1 (齐次坐标)
|
644 |
ones = torch.ones((T, N, 1), device=self.device, dtype=pts.dtype)
|
645 |
points_world_h = torch.cat([pts, ones], dim=-1)
|
646 |
points_camera_h = torch.bmm(poses, points_world_h.permute(0, 2, 1))
|
@@ -649,22 +874,21 @@ class CameraMotionGenerator:
|
|
649 |
points_image_h = torch.bmm(points_camera, intr.permute(0, 2, 1))
|
650 |
|
651 |
uv = points_image_h[:, :, :2] / points_image_h[:, :, 2:3]
|
652 |
-
|
653 |
-
# Step 5: 提取深度 (Z) 并拼接
|
654 |
depth = points_camera[:, :, 2:3] # (T, N, 1)
|
655 |
uvd = torch.cat([uv, depth], dim=-1) # (T, N, 3)
|
656 |
|
657 |
-
return uvd
|
658 |
-
|
659 |
-
def apply_motion_on_pts(self, pts, camera_motion):
|
660 |
-
tracking_pts = self._apply_poses(pts.squeeze(), camera_motion).unsqueeze(0)
|
661 |
-
return tracking_pts
|
662 |
|
663 |
def set_intr(self, K):
|
664 |
if isinstance(K, np.ndarray):
|
665 |
K = torch.from_numpy(K)
|
666 |
self.intr = K.to(self.device)
|
667 |
|
|
|
|
|
|
|
|
|
|
|
668 |
def rot_poses(self, angle, axis='y'):
|
669 |
"""Generate a single rotation matrix
|
670 |
|
@@ -783,26 +1007,6 @@ class CameraMotionGenerator:
|
|
783 |
camera_poses = np.concatenate(cam_poses, axis=0)
|
784 |
return torch.from_numpy(camera_poses).to(self.device)
|
785 |
|
786 |
-
def rot(self, pts, angle, axis):
|
787 |
-
"""
|
788 |
-
pts: torch.Tensor, (T, N, 2)
|
789 |
-
"""
|
790 |
-
rot_mats = self.rot_poses(angle, axis)
|
791 |
-
pts = self.apply_motion_on_pts(pts, rot_mats)
|
792 |
-
return pts
|
793 |
-
|
794 |
-
def trans(self, pts, dx, dy, dz):
|
795 |
-
if pts.shape[-1] != 3:
|
796 |
-
raise ValueError("points should be in the 3d coordinate.")
|
797 |
-
trans_mats = self.trans_poses(dx, dy, dz)
|
798 |
-
pts = self.apply_motion_on_pts(pts, trans_mats)
|
799 |
-
return pts
|
800 |
-
|
801 |
-
def spiral(self, pts, radius):
|
802 |
-
spiral_poses = self.spiral_poses(radius)
|
803 |
-
pts = self.apply_motion_on_pts(pts, spiral_poses)
|
804 |
-
return pts
|
805 |
-
|
806 |
def get_default_motion(self):
|
807 |
"""Parse motion parameters and generate corresponding motion matrices
|
808 |
|
@@ -820,6 +1024,7 @@ class CameraMotionGenerator:
|
|
820 |
- if not specified, defaults to 0-49
|
821 |
- frames after end_frame will maintain the final transformation
|
822 |
- for combined transformations, they are applied in sequence
|
|
|
823 |
|
824 |
Returns:
|
825 |
torch.Tensor: Motion matrices [num_frames, 4, 4]
|
|
|
22 |
from models.cogvideox_tracking import CogVideoXImageToVideoPipelineTracking
|
23 |
|
24 |
from submodules.MoGe.moge.model import MoGeModel
|
25 |
+
|
26 |
from image_gen_aux import DepthPreprocessor
|
27 |
from moviepy.editor import ImageSequenceClip
|
|
|
28 |
|
29 |
class DiffusionAsShaderPipeline:
|
30 |
def __init__(self, gpu_id=0, output_dir='outputs'):
|
|
|
45 |
# device
|
46 |
self.device = f"cuda:{gpu_id}"
|
47 |
torch.cuda.set_device(gpu_id)
|
48 |
+
self.dtype = torch.bfloat16
|
49 |
|
50 |
# files
|
51 |
self.output_dir = output_dir
|
|
|
57 |
transforms.ToTensor()
|
58 |
])
|
59 |
|
|
|
60 |
@torch.no_grad()
|
61 |
def _infer(
|
62 |
self,
|
|
|
65 |
tracking_tensor: torch.Tensor = None,
|
66 |
image_tensor: torch.Tensor = None, # [C,H,W] in range [0,1]
|
67 |
output_path: str = "./output.mp4",
|
68 |
+
num_inference_steps: int = 25,
|
69 |
guidance_scale: float = 6.0,
|
70 |
num_videos_per_prompt: int = 1,
|
71 |
dtype: torch.dtype = torch.bfloat16,
|
|
|
114 |
pipe.text_encoder.eval()
|
115 |
pipe.vae.eval()
|
116 |
|
117 |
+
self.dtype = dtype
|
118 |
+
|
119 |
# Process tracking tensor
|
120 |
tracking_maps = tracking_tensor.float() # [T, C, H, W]
|
121 |
tracking_maps = tracking_maps.to(device=self.device, dtype=dtype)
|
|
|
169 |
|
170 |
def _set_camera_motion(self, camera_motion):
|
171 |
self.camera_motion = camera_motion
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
##============= SpatialTracker =============##
|
174 |
+
|
175 |
def generate_tracking_spatracker(self, video_tensor, density=70):
|
176 |
"""Generate tracking video
|
177 |
|
|
|
184 |
print("Loading tracking models...")
|
185 |
# Load tracking model
|
186 |
tracker = SpaTrackerPredictor(
|
187 |
+
checkpoint=os.path.join(project_root, 'checkpoints/spaT_final.pth'),
|
188 |
interp_shape=(384, 576),
|
189 |
seq_length=12
|
190 |
).to(self.device)
|
|
|
219 |
progressive_tracking=False
|
220 |
)
|
221 |
|
222 |
+
return pred_tracks.squeeze(0), pred_visibility.squeeze(0), T_Firsts
|
223 |
|
224 |
finally:
|
225 |
# Clean up GPU memory
|
226 |
del tracker, self.depth_preprocessor
|
227 |
torch.cuda.empty_cache()
|
228 |
|
|
|
229 |
def visualize_tracking_spatracker(self, video, pred_tracks, pred_visibility, T_Firsts, save_tracking=True):
|
230 |
video = video.unsqueeze(0).to(self.device)
|
231 |
vis = Visualizer(save_dir=self.output_dir, grayscale=False, fps=24, pad_value=0)
|
|
|
315 |
outline=tuple(color),
|
316 |
)
|
317 |
|
|
|
318 |
def visualize_tracking_moge(self, points, mask, save_tracking=True):
|
319 |
"""Visualize tracking results from MoGe model
|
320 |
|
|
|
348 |
normalized_z = np.clip((inv_z - p2) / (p98 - p2), 0, 1)
|
349 |
colors[:, :, 2] = (normalized_z * 255).astype(np.uint8)
|
350 |
colors = colors.astype(np.uint8)
|
|
|
|
|
351 |
|
352 |
points = points.reshape(T, -1, 3)
|
353 |
colors = colors.reshape(-1, 3)
|
|
|
355 |
# Initialize list to store frames
|
356 |
frames = []
|
357 |
|
358 |
+
for i, pts_i in enumerate(tqdm(points, desc="rendering frames")):
|
359 |
pixels, depths = pts_i[..., :2], pts_i[..., 2]
|
360 |
pixels[..., 0] = pixels[..., 0] * W
|
361 |
pixels[..., 1] = pixels[..., 1] * H
|
|
|
398 |
tracking_path = None
|
399 |
|
400 |
return tracking_path, tracking_video
|
401 |
+
|
402 |
+
|
 403 | +   ##============= CoTracker =============##
 404 | +
 405 | +   def generate_tracking_cotracker(self, video_tensor, density=70):
 406 | +       """Generate tracking video
 407 | +
 408 | +       Args:
 409 | +           video_tensor (torch.Tensor): Input video tensor
 410 | +
 411 | +       Returns:
 412 | +           tuple: (pred_tracks, pred_visibility)
 413 | +               - pred_tracks (torch.Tensor): Tracking points with depth [T, N, 3]
 414 | +               - pred_visibility (torch.Tensor): Visibility mask [T, N, 1]
 415 | +       """
 416 | +       # Generate tracking points
 417 | +       cotracker = torch.hub.load("facebookresearch/co-tracker", "cotracker3_offline").to(self.device)
 418 | +
 419 | +       # Load depth model
 420 | +       if not hasattr(self, 'depth_preprocessor') or self.depth_preprocessor is None:
 421 | +           self.depth_preprocessor = DepthPreprocessor.from_pretrained("Intel/zoedepth-nyu-kitti")
 422 | +           self.depth_preprocessor.to(self.device)
 423 | +
 424 | +       try:
 425 | +           video = video_tensor.unsqueeze(0).to(self.device)
 426 | +
 427 | +           # Process all frames to get depth maps
 428 | +           video_depths = []
 429 | +           for i in tqdm(range(video_tensor.shape[0]), desc="estimating depth"):
 430 | +               frame = (video_tensor[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
 431 | +               depth = self.depth_preprocessor(Image.fromarray(frame))[0]
 432 | +               depth_tensor = transforms.ToTensor()(depth)  # [1, H, W]
 433 | +               video_depths.append(depth_tensor)
 434 | +
 435 | +           video_depth = torch.stack(video_depths, dim=0).to(self.device)  # [T, 1, H, W]
 436 | +
 437 | +           # Get tracking points and visibility
 438 | +           print("tracking...")
 439 | +           pred_tracks, pred_visibility = cotracker(video, grid_size=density)  # B T N 2, B T N 1
 440 | +
 441 | +           # Extract dimensions
 442 | +           B, T, N, _ = pred_tracks.shape
 443 | +           H, W = video_depth.shape[2], video_depth.shape[3]
 444 | +
 445 | +           # Create output tensor with depth
 446 | +           pred_tracks_with_depth = torch.zeros((B, T, N, 3), device=self.device)
 447 | +           pred_tracks_with_depth[:, :, :, :2] = pred_tracks  # Copy x,y coordinates
 448 | +
 449 | +           # Vectorized approach to get depths for all points
 450 | +           # Reshape pred_tracks to process all batches and frames at once
 451 | +           flat_tracks = pred_tracks.reshape(B*T, N, 2)
 452 | +
 453 | +           # Clamp coordinates to valid image bounds
 454 | +           x_coords = flat_tracks[:, :, 0].clamp(0, W-1).long()  # [B*T, N]
 455 | +           y_coords = flat_tracks[:, :, 1].clamp(0, H-1).long()  # [B*T, N]
 456 | +
 457 | +           # Get depths for all points at once
 458 | +           # For each point in the flattened batch, get its depth from the corresponding frame
 459 | +           depths = torch.zeros((B*T, N), device=self.device)
 460 | +           for bt in range(B*T):
 461 | +               t = bt % T  # Time index
 462 | +               depths[bt] = video_depth[t, 0, y_coords[bt], x_coords[bt]]
 463 | +
 464 | +           # Reshape depths back to [B, T, N] and assign to output tensor
 465 | +           pred_tracks_with_depth[:, :, :, 2] = depths.reshape(B, T, N)
 466 | +
 467 | +           return pred_tracks_with_depth.squeeze(0), pred_visibility.squeeze(0)
 468 | +
 469 | +       finally:
 470 | +           del cotracker
 471 | +           torch.cuda.empty_cache()
 472 | +
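For reference, the depth-sampling step above (clamp each 2D track coordinate to the image bounds, then read the per-frame ZoeDepth map at that pixel) can also be written without the explicit Python loop over B*T by indexing all frames at once. The sketch below is illustrative only and not part of this commit; every name in it is local to the example.

import torch

def attach_depth(tracks_2d: torch.Tensor, depth: torch.Tensor) -> torch.Tensor:
    # tracks_2d: [T, N, 2] pixel coordinates (x, y); depth: [T, 1, H, W] depth maps
    T, N, _ = tracks_2d.shape
    H, W = depth.shape[-2:]
    x = tracks_2d[..., 0].clamp(0, W - 1).long()               # [T, N]
    y = tracks_2d[..., 1].clamp(0, H - 1).long()               # [T, N]
    t = torch.arange(T, device=tracks_2d.device)[:, None].expand(T, N)
    z = depth[t, 0, y, x]                                      # [T, N], advanced indexing
    return torch.cat([tracks_2d, z.unsqueeze(-1)], dim=-1)     # [T, N, 3] = (x, y, z)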
 473 | +   def visualize_tracking_cotracker(self, points, vis_mask=None, save_tracking=True, point_wise=4, video_size=(480, 720)):
 474 | +       """Visualize tracking results from CoTracker
 475 | +
 476 | +       Args:
 477 | +           points (torch.Tensor): Points array of shape [T, N, 3]
 478 | +           vis_mask (torch.Tensor): Visibility mask of shape [T, N, 1]
 479 | +           save_tracking (bool): Whether to save tracking video
 480 | +           point_wise (int): Size of points in visualization
 481 | +           video_size (tuple): Render size (height, width)
 482 | +
 483 | +       Returns:
 484 | +           tuple: (tracking_path, tracking_video)
 485 | +       """
 486 | +       # Move tensors to CPU and convert to numpy
 487 | +       if isinstance(points, torch.Tensor):
 488 | +           points = points.detach().cpu().numpy()
 489 | +
 490 | +       if vis_mask is not None and isinstance(vis_mask, torch.Tensor):
 491 | +           vis_mask = vis_mask.detach().cpu().numpy()
 492 | +           # Reshape if needed
 493 | +           if vis_mask.ndim == 3 and vis_mask.shape[2] == 1:
 494 | +               vis_mask = vis_mask.squeeze(-1)
 495 | +
 496 | +       T, N, _ = points.shape
 497 | +       H, W = video_size
 498 | +
 499 | +       if vis_mask is None:
 500 | +           vis_mask = np.ones((T, N), dtype=bool)
 501 | +
 502 | +       colors = np.zeros((N, 3), dtype=np.uint8)
 503 | +
 504 | +       first_frame_pts = points[0]
 505 | +
 506 | +       u_min, u_max = 0, W
 507 | +       u_normalized = np.clip((first_frame_pts[:, 0] - u_min) / (u_max - u_min), 0, 1)
 508 | +       colors[:, 0] = (u_normalized * 255).astype(np.uint8)
 509 | +
 510 | +       v_min, v_max = 0, H
 511 | +       v_normalized = np.clip((first_frame_pts[:, 1] - v_min) / (v_max - v_min), 0, 1)
 512 | +       colors[:, 1] = (v_normalized * 255).astype(np.uint8)
 513 | +
 514 | +       z_values = first_frame_pts[:, 2]
 515 | +       if np.all(z_values == 0):
 516 | +           colors[:, 2] = np.random.randint(0, 256, N, dtype=np.uint8)
 517 | +       else:
 518 | +           inv_z = 1 / (z_values + 1e-10)
 519 | +           p2 = np.percentile(inv_z, 2)
 520 | +           p98 = np.percentile(inv_z, 98)
 521 | +           normalized_z = np.clip((inv_z - p2) / (p98 - p2 + 1e-10), 0, 1)
 522 | +           colors[:, 2] = (normalized_z * 255).astype(np.uint8)
 523 | +
 524 | +       frames = []
 525 | +
 526 | +       for i in tqdm(range(T), desc="rendering frames"):
 527 | +           pts_i = points[i]
 528 | +
 529 | +           visibility = vis_mask[i]
 530 | +
 531 | +           pixels, depths = pts_i[visibility, :2], pts_i[visibility, 2]
 532 | +           pixels = pixels.astype(int)
 533 | +
 534 | +           in_frame = self.valid_mask(pixels, W, H)
 535 | +           pixels = pixels[in_frame]
 536 | +           depths = depths[in_frame]
 537 | +           frame_rgb = colors[visibility][in_frame]
 538 | +
 539 | +           img = Image.fromarray(np.zeros((H, W, 3), dtype=np.uint8), mode="RGB")
 540 | +
 541 | +           sorted_pixels, _, sort_index = self.sort_points_by_depth(pixels, depths)
 542 | +           sorted_rgb = frame_rgb[sort_index]
 543 | +
 544 | +           for j in range(sorted_pixels.shape[0]):
 545 | +               self.draw_rectangle(
 546 | +                   img,
 547 | +                   coord=(sorted_pixels[j, 0], sorted_pixels[j, 1]),
 548 | +                   side_length=point_wise,
 549 | +                   color=sorted_rgb[j],
 550 | +               )
 551 | +
 552 | +           frames.append(np.array(img))
 553 | +
 554 | +       # Convert frames to video tensor in range [0,1]
 555 | +       tracking_video = torch.from_numpy(np.stack(frames)).permute(0, 3, 1, 2).float() / 255.0
 556 | +
 557 | +       tracking_path = None
 558 | +       if save_tracking:
 559 | +           try:
 560 | +               tracking_path = os.path.join(self.output_dir, "tracking_video_cotracker.mp4")
 561 | +               # Convert back to uint8 for saving
 562 | +               uint8_frames = [frame.astype(np.uint8) for frame in frames]
 563 | +               clip = ImageSequenceClip(uint8_frames, fps=self.fps)
 564 | +               clip.write_videofile(tracking_path, codec="libx264", fps=self.fps, logger=None)
 565 | +               print(f"Video saved to {tracking_path}")
 566 | +           except Exception as e:
 567 | +               print(f"Warning: Failed to save tracking video: {e}")
 568 | +               tracking_path = None
 569 | +
 570 | +       return tracking_path, tracking_video
 571 | +
 572 |
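A minimal usage sketch of the two CoTracker helpers added above, assuming `das` is an already-initialized DiffusionAsShaderPipeline (so `fps` and `output_dir` are set) and `video_tensor` is a [T, C, H, W] float video in [0, 1]; this call chain is illustrative, not code from the commit.

# Hypothetical call chain; the helper names come from the diff above
tracks, visibility = das.generate_tracking_cotracker(video_tensor, density=70)
tracking_path, tracking_video = das.visualize_tracking_cotracker(
    tracks, vis_mask=visibility, save_tracking=True, point_wise=4, video_size=(480, 720)
)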
 573 |     def apply_tracking(self, video_tensor, fps=8, tracking_tensor=None, img_cond_tensor=None, prompt=None, checkpoint_path=None):
 574 |         """Generate final video with motion transfer
 575 |

 595 |             tracking_tensor=tracking_tensor,
 596 |             image_tensor=img_cond_tensor,
 597 |             output_path=final_output,
 598 | +           num_inference_steps=25,
 599 |             guidance_scale=6.0,
 600 |             dtype=torch.bfloat16,
 601 |             fps=self.fps

 610 |         """
 611 |         self.object_motion = motion_type
 612 |
 613 | class FirstFrameRepainter:
 614 |     def __init__(self, gpu_id=0, output_dir='outputs'):
 615 |         """Initialize FirstFrameRepainter

 622 |         self.output_dir = output_dir
 623 |         self.max_depth = 65.0
 624 |         os.makedirs(output_dir, exist_ok=True)
 625 | +
 626 |     def repaint(self, image_tensor, prompt, depth_path=None, method="dav"):
 627 |         """Repaint first frame using Flux
 628 |
 714 |         fx = fy = (W / 2) / math.tan(fov_rad / 2)
 715 |
 716 |         self.intr[0, 0] = fx
 717 | +       self.intr[1, 1] = fy
 718 | +
 719 | +       self.extr = torch.eye(4, device=device)
 720 |
 721 | +   def s2w_vggt(self, points, extrinsics, intrinsics):
 722 |         """
 723 | +       Transform points from pixel coordinates to world coordinates
 724 | +
 725 |         Args:
 726 | +           points: Point cloud data of shape [T, N, 3] in uvz format
 727 | +           extrinsics: Camera extrinsic matrices [B, T, 3, 4] or [T, 3, 4]
 728 | +           intrinsics: Camera intrinsic matrices [B, T, 3, 3] or [T, 3, 3]
 729 | +
 730 | +       Returns:
 731 | +           world_points: Point cloud in world coordinates [T, N, 3]
 732 |         """
 733 | +       if isinstance(points, torch.Tensor):
 734 | +           points = points.detach().cpu().numpy()
 735 | +
 736 | +       if isinstance(extrinsics, torch.Tensor):
 737 | +           extrinsics = extrinsics.detach().cpu().numpy()
 738 | +       # Handle batch dimension
 739 | +       if extrinsics.ndim == 4:  # [B, T, 3, 4]
 740 | +           extrinsics = extrinsics[0]  # Take first batch
 741 | +
 742 | +       if isinstance(intrinsics, torch.Tensor):
 743 | +           intrinsics = intrinsics.detach().cpu().numpy()
 744 | +       # Handle batch dimension
 745 | +       if intrinsics.ndim == 4:  # [B, T, 3, 3]
 746 | +           intrinsics = intrinsics[0]  # Take first batch
 747 | +
 748 | +       T, N, _ = points.shape
 749 | +       world_points = np.zeros_like(points)
 750 |
 751 | +       # Extract uvz coordinates
 752 | +       uvz = points
 753 | +       valid_mask = uvz[..., 2] > 0
 754 |
 755 | +       # Create homogeneous coordinates [u, v, 1]
 756 | +       uv_homogeneous = np.concatenate([uvz[..., :2], np.ones((T, N, 1))], axis=-1)
 757 | +
 758 | +       # Transform from pixel to camera coordinates
 759 | +       for i in range(T):
 760 | +           K = intrinsics[i]
 761 | +           K_inv = np.linalg.inv(K)
 762 | +
 763 | +           R = extrinsics[i, :, :3]
 764 | +           t = extrinsics[i, :, 3]
 765 | +
 766 | +           R_inv = np.linalg.inv(R)
 767 | +
 768 | +           valid_indices = np.where(valid_mask[i])[0]
 769 | +
 770 | +           if len(valid_indices) > 0:
 771 | +               valid_uv = uv_homogeneous[i, valid_indices]
 772 | +               valid_z = uvz[i, valid_indices, 2]
 773 | +
 774 | +               valid_xyz_camera = valid_uv @ K_inv.T
 775 | +               valid_xyz_camera = valid_xyz_camera * valid_z[:, np.newaxis]
 776 | +
 777 | +               # Transform from camera to world coordinates: X_world = R^-1 * (X_camera - t)
 778 | +               valid_world_points = (valid_xyz_camera - t) @ R_inv.T
 779 | +
 780 | +               world_points[i, valid_indices] = valid_world_points
 781 | +
 782 | +       return world_points
 783 |
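As a quick sanity check on the math inside `s2w_vggt` (unproject with the inverse intrinsics, scale by depth, then undo the extrinsic rotation and translation), here is a single-point NumPy example with made-up camera parameters; it is illustrative only and not part of the commit.

import numpy as np

# X_cam = z * K^-1 @ [u, v, 1]^T ;  X_world = R^-1 @ (X_cam - t)
K = np.array([[500.0,   0.0, 320.0],
              [  0.0, 500.0, 240.0],
              [  0.0,   0.0,   1.0]])
R, t = np.eye(3), np.zeros(3)                       # identity pose for the example
u, v, z = 320.0, 240.0, 2.0                         # principal point, depth 2
x_cam = z * (np.linalg.inv(K) @ np.array([u, v, 1.0]))
x_world = np.linalg.inv(R) @ (x_cam - t)
print(x_world)                                      # -> [0. 0. 2.]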
 784 | +   def w2s_vggt(self, world_points, extrinsics, intrinsics, poses=None):
 785 | +       """
 786 | +       Project points from world coordinates to camera view
 787 | +
 788 | +       Args:
 789 | +           world_points: Point cloud in world coordinates [T, N, 3]
 790 | +           extrinsics: Original camera extrinsic matrices [B, T, 3, 4] or [T, 3, 4]
 791 | +           intrinsics: Camera intrinsic matrices [B, T, 3, 3] or [T, 3, 3]
 792 | +           poses: Camera pose matrices [T, 4, 4], if None use first frame extrinsics
 793 | +
 794 | +       Returns:
 795 | +           camera_points: Point cloud in camera coordinates [T, N, 3] in uvz format
 796 | +       """
 797 | +       if isinstance(world_points, torch.Tensor):
 798 | +           world_points = world_points.detach().cpu().numpy()
 799 | +
 800 | +       if isinstance(extrinsics, torch.Tensor):
 801 | +           extrinsics = extrinsics.detach().cpu().numpy()
 802 | +       if extrinsics.ndim == 4:
 803 | +           extrinsics = extrinsics[0]
 804 | +
 805 | +       if isinstance(intrinsics, torch.Tensor):
 806 | +           intrinsics = intrinsics.detach().cpu().numpy()
 807 | +       if intrinsics.ndim == 4:
 808 | +           intrinsics = intrinsics[0]
 809 | +
 810 | +       T, N, _ = world_points.shape
 811 | +
 812 | +       # If no poses provided, use first frame extrinsics
 813 | +       if poses is None:
 814 | +           pose1 = np.eye(4)
 815 | +           pose1[:3, :3] = extrinsics[0, :, :3]
 816 | +           pose1[:3, 3] = extrinsics[0, :, 3]
 817 | +
 818 | +           camera_poses = np.tile(pose1[np.newaxis, :, :], (T, 1, 1))
 819 | +       else:
 820 | +           if isinstance(poses, torch.Tensor):
 821 | +               camera_poses = poses.cpu().numpy()
 822 | +           else:
 823 | +               camera_poses = poses
 824 | +
 825 | +       # Scale translation by 1/5
 826 | +       scaled_poses = camera_poses.copy()
 827 | +       scaled_poses[:, :3, 3] = camera_poses[:, :3, 3] / 5.0
 828 | +       camera_poses = scaled_poses
 829 | +
 830 | +       # Add homogeneous coordinates
 831 | +       ones = np.ones([T, N, 1])
 832 | +       world_points_hom = np.concatenate([world_points, ones], axis=-1)
 833 | +
 834 | +       # Transform points using batch matrix multiplication
 835 | +       pts_cam_hom = np.matmul(world_points_hom, np.transpose(camera_poses, (0, 2, 1)))
 836 | +       pts_cam = pts_cam_hom[..., :3]
 837 | +
 838 | +       # Extract depth information
 839 | +       depths = pts_cam[..., 2:3]
 840 | +       valid_mask = depths[..., 0] > 0
 841 | +
 842 | +       # Normalize coordinates
 843 | +       normalized_pts = pts_cam / (depths + 1e-10)
 844 | +
 845 | +       # Apply intrinsic matrix for projection
 846 | +       pts_pixel = np.matmul(normalized_pts, np.transpose(intrinsics, (0, 2, 1)))
 847 | +
 848 | +       # Extract pixel coordinates
 849 | +       u = pts_pixel[..., 0:1]
 850 | +       v = pts_pixel[..., 1:2]
 851 | +
 852 | +       # Set invalid points to zero
 853 | +       u[~valid_mask] = 0
 854 | +       v[~valid_mask] = 0
 855 | +       depths[~valid_mask] = 0
 856 | +
 857 | +       # Return points in uvz format
 858 | +       result = np.concatenate([u, v, depths], axis=-1)
 859 | +
 860 | +       return torch.from_numpy(result)
 861 |
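The forward direction used by `w2s_vggt` (apply the 3x4 pose, divide by depth, then apply the intrinsics) can be checked point-wise the same way. Again an illustrative sketch with synthetic parameters, not repository code.

import numpy as np

K = np.array([[500.0,   0.0, 320.0],
              [  0.0, 500.0, 240.0],
              [  0.0,   0.0,   1.0]])
P = np.hstack([np.eye(3), np.zeros((3, 1))])        # 3x4 [R | t], identity pose
x_world_h = np.array([0.0, 0.0, 2.0, 1.0])          # homogeneous world point
x_cam = P @ x_world_h                               # camera coordinates, depth z = 2
u, v, _ = K @ (x_cam / x_cam[2])                    # perspective divide, then project
print(u, v, x_cam[2])                               # -> 320.0 240.0 2.0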
 862 | +   def w2s_moge(self, pts, poses):
 863 |         if isinstance(poses, np.ndarray):
 864 |             poses = torch.from_numpy(poses)
 865 |         assert poses.shape[0] == self.frame_num
 866 |         poses = poses.to(torch.float32).to(self.device)
 867 |         T, N, _ = pts.shape  # (T, N, 3)
 868 |         intr = self.intr.unsqueeze(0).repeat(self.frame_num, 1, 1)
 869 |         ones = torch.ones((T, N, 1), device=self.device, dtype=pts.dtype)
 870 |         points_world_h = torch.cat([pts, ones], dim=-1)
 871 |         points_camera_h = torch.bmm(poses, points_world_h.permute(0, 2, 1))

 874 |         points_image_h = torch.bmm(points_camera, intr.permute(0, 2, 1))
 875 |
 876 |         uv = points_image_h[:, :, :2] / points_image_h[:, :, 2:3]
 877 |         depth = points_camera[:, :, 2:3]  # (T, N, 1)
 878 |         uvd = torch.cat([uv, depth], dim=-1)  # (T, N, 3)
 879 |
 880 | +       return uvd
 881 |
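`w2s_moge` performs the same projection for all frames at once via `torch.bmm` on batched poses and intrinsics. Below is a compact, self-contained sketch of that batched pattern; every tensor and name is synthetic and local to the example, not taken from the repository.

import torch

T, N = 49, 100
pts_world = torch.rand(T, N, 3) + torch.tensor([0.0, 0.0, 1.0])    # keep depth > 0
poses = torch.eye(4).repeat(T, 1, 1)                    # [T, 4, 4] world -> camera
intr = torch.tensor([[500.0,   0.0, 360.0],
                     [  0.0, 500.0, 240.0],
                     [  0.0,   0.0,   1.0]]).repeat(T, 1, 1)        # [T, 3, 3]
ones = torch.ones(T, N, 1)
pts_h = torch.cat([pts_world, ones], dim=-1)            # [T, N, 4] homogeneous
cam = torch.bmm(pts_h, poses.transpose(1, 2))[..., :3]  # [T, N, 3] camera coords
img_h = torch.bmm(cam, intr.transpose(1, 2))            # [T, N, 3] = (K @ X_cam)^T
uv = img_h[..., :2] / img_h[..., 2:3]                   # perspective divide
uvd = torch.cat([uv, cam[..., 2:3]], dim=-1)            # [T, N, 3]: u, v, depth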
 882 |     def set_intr(self, K):
 883 |         if isinstance(K, np.ndarray):
 884 |             K = torch.from_numpy(K)
 885 |         self.intr = K.to(self.device)
 886 |
 887 | +   def set_extr(self, extr):
 888 | +       if isinstance(extr, np.ndarray):
 889 | +           extr = torch.from_numpy(extr)
 890 | +       self.extr = extr.to(self.device)
 891 | +
 892 |     def rot_poses(self, angle, axis='y'):
 893 |         """Generate a single rotation matrix
 894 |

1007 |         camera_poses = np.concatenate(cam_poses, axis=0)
1008 |         return torch.from_numpy(camera_poses).to(self.device)
1009 |
1010 |     def get_default_motion(self):
1011 |         """Parse motion parameters and generate corresponding motion matrices
1012 |

1024 |             - if not specified, defaults to 0-49
1025 |             - frames after end_frame will maintain the final transformation
1026 |             - for combined transformations, they are applied in sequence
1027 | +           - moving left, up and zoom out is positive in video
1028 |
1029 |         Returns:
1030 |             torch.Tensor: Motion matrices [num_frames, 4, 4]