Anonymous committed · e84616f · Parent(s): e994f84

add spaces
app.py
CHANGED
@@ -22,6 +22,9 @@ from funcs import (
 from utils.utils import instantiate_from_config
 from utils.utils_freetraj import plan_path
 
+video_length = 16
+width = 512
+height = 320
 MAX_KEYS = 5
 
 ckpt_dir_512 = "checkpoints/base_512_v2"
@@ -56,7 +59,7 @@ def check(radio_mode):
     video_bbox_path = "output_freetraj_bbox.mp4"
     return video_path, video_bbox_path
 
-
+
 def infer(*user_args):
     prompt_in = user_args[0]
     target_indices = user_args[1]
@@ -75,9 +78,6 @@ def infer(*user_args):
     w_positions = user_args[-MAX_KEYS:]
     print(user_args)
 
-    video_length = 16
-    width = 512
-    height = 320
     if radio_mode == 'ori':
         config_512 = "configs/inference_t2v_512_v2.0.yaml"
     else:
@@ -110,15 +110,6 @@ def infer(*user_args):
 
     config_512 = OmegaConf.load(config_512)
     model_config_512 = config_512.pop("model", OmegaConf.create())
-    model = instantiate_from_config(model_config_512)
-    model = model.cuda()
-    model = load_model_checkpoint(model, ckpt_path_512)
-    model.eval()
-
-    if seed is None:
-        seed = int.from_bytes(os.urandom(2), "big")
-    print(f"Using seed: {seed}")
-    seed_everything(seed)
 
     args = argparse.Namespace(
         mode="base",
@@ -127,57 +118,20 @@ def infer(*user_args):
         ddim_steps=ddim_steps,
         ddim_eta=0.0,
         bs=1,
-        height=height,
-        width=width,
-        frames=video_length,
         fps=video_fps,
         unconditional_guidance_scale=unconditional_guidance_scale,
         unconditional_guidance_scale_temporal=None,
         cond_input=None,
+        prompt_in = prompt_in,
+        seed = seed,
         ddim_edit = ddim_edit,
+        model_config_512 = model_config_512,
+        idx_list = idx_list,
+        input_traj = input_traj,
     )
 
-
-
-    frames = model.temporal_length if args.frames < 0 else args.frames
-    channels = model.channels
-
-    batch_size = 1
-    noise_shape = [batch_size, channels, frames, h, w]
-    fps = torch.tensor([args.fps] * batch_size).to(model.device).long()
-    prompts = [prompt_in]
-    text_emb = model.get_learned_conditioning(prompts)
-
-    cond = {"c_crossattn": [text_emb], "fps": fps}
-
-    ## inference
-    if radio_mode == 'ori':
-        batch_samples = batch_ddim_sampling(
-            model,
-            cond,
-            noise_shape,
-            args.n_samples,
-            args.ddim_steps,
-            args.ddim_eta,
-            args.unconditional_guidance_scale,
-            args=args,
-        )
-    else:
-        batch_samples = batch_ddim_sampling_freetraj(
-            model,
-            cond,
-            noise_shape,
-            args.n_samples,
-            args.ddim_steps,
-            args.ddim_eta,
-            args.unconditional_guidance_scale,
-            idx_list = idx_list,
-            input_traj = input_traj,
-            args=args,
-        )
-
-    vid_tensor = batch_samples[0]
-    video = vid_tensor.detach().cpu()
+    video = infer_gpu_part(args)
+
     video = torch.clamp(video.float(), -1.0, 1.0)
     video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
 
@@ -251,6 +205,67 @@ def infer(*user_args):
 
     return video_path, video_bbox_path
 
+
+
+@spaces.GPU(duration=270)
+def infer_gpu_part(args):
+
+    model = instantiate_from_config(args.model_config_512)
+    model = model.cuda()
+    model = load_model_checkpoint(model, ckpt_path_512)
+    model.eval()
+
+    if args.seed is None:
+        seed = int.from_bytes(os.urandom(2), "big")
+    else:
+        seed = args.seed
+    print(f"Using seed: {seed}")
+    seed_everything(seed)
+
+    ## latent noise shape
+    h, w = height // 8, width // 8
+    frames = video_length
+    channels = model.channels
+
+    batch_size = 1
+    noise_shape = [batch_size, channels, frames, h, w]
+    fps = torch.tensor([args.fps] * batch_size).to(model.device).long()
+    prompts = [args.prompt_in]
+    text_emb = model.get_learned_conditioning(prompts)
+
+    cond = {"c_crossattn": [text_emb], "fps": fps}
+
+    ## inference
+    if radio_mode == 'ori':
+        batch_samples = batch_ddim_sampling(
+            model,
+            cond,
+            noise_shape,
+            args.n_samples,
+            args.ddim_steps,
+            args.ddim_eta,
+            args.unconditional_guidance_scale,
+            args=args,
+        )
+    else:
+        batch_samples = batch_ddim_sampling_freetraj(
+            model,
+            cond,
+            noise_shape,
+            args.n_samples,
+            args.ddim_steps,
+            args.ddim_eta,
+            args.unconditional_guidance_scale,
+            idx_list = args.idx_list,
+            input_traj = args.input_traj,
+            args=args,
+        )
+
+    vid_tensor = batch_samples[0]
+    video = vid_tensor.detach().cpu()
+
+    return video
+
 
 examples = [
     ["A squirrel jumping from one tree to another.",],
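
The refactor above is the standard Hugging Face ZeroGPU pattern: on a ZeroGPU Space, a GPU is only attached while a function decorated with @spaces.GPU is running, so everything that touches CUDA (building the model, loading the checkpoint, sampling) moves into infer_gpu_part. A minimal, self-contained sketch of the pattern on ZeroGPU hardware (the toy double_on_gpu function is hypothetical, not part of app.py):

import spaces
import torch

@spaces.GPU(duration=270)  # request a GPU for up to 270 s per call
def double_on_gpu(x: torch.Tensor) -> torch.Tensor:
    # CUDA work must happen inside the decorated function; outside it,
    # the ZeroGPU process has no GPU attached.
    return (x.cuda() * 2).cpu()

print(double_on_gpu(torch.ones(3)))  # tensor([2., 2., 2.])

This is also why the commit threads prompt_in, seed, model_config_512, idx_list, and input_traj through the argparse.Namespace: infer_gpu_part cannot see infer's locals. Note that it still reads radio_mode, which in this diff is only defined inside infer, so passing it through args as well would keep the decorated function self-contained.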
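
For reference, infer_gpu_part samples in the VAE latent space, where height and width are the pixel dimensions divided by 8. A worked check of the noise shape with the module-level constants from this commit (channels = 4 is an assumption for illustration; app.py reads it from model.channels at runtime):

# Worked check of the latent noise shape built in infer_gpu_part.
# channels = 4 is assumed here; app.py takes it from model.channels.
video_length, width, height = 16, 512, 320
batch_size, channels = 1, 4

h, w = height // 8, width // 8   # VAE downsampling factor of 8
noise_shape = [batch_size, channels, video_length, h, w]
print(noise_shape)               # [1, 4, 16, 40, 64]

The fallback seed drawn with int.from_bytes(os.urandom(2), "big") is likewise a 16-bit value, i.e. an integer in [0, 65535].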