GeradeHouse committed
Commit f40229f · verified · 1 Parent(s): c83344b

Update app.py

Files changed (1):
  1. app.py  +32 -26
app.py CHANGED
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 """
 Gradio demo for Wan2.1 First-Last-Frame-to-Video (FLF2V)
-Author: <your-handle>
+Author: GeradeHouse
 """

 import numpy as np
@@ -9,28 +9,29 @@ import torch
 import gradio as gr
 from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
 from diffusers.utils import export_to_video
-from transformers import CLIPVisionModel
+from transformers import CLIPVisionModel, CLIPImageProcessor
 from PIL import Image
 import torchvision.transforms.functional as TF

 # ---------------------------------------------------------------------
 # CONFIG ----------------------------------------------------------------
-MODEL_ID = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"  # switch to 1.3B if needed
-DTYPE = torch.float16        # or torch.bfloat16 on AMP-friendly GPUs
-MAX_AREA = 1280 * 720        # keep 720p
-DEFAULT_FRAMES = 81          # 5s at 16 fps
+MODEL_ID = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"  # or switch to 1.3B
+DTYPE = torch.float16        # or bfloat16
+MAX_AREA = 1280 * 720        # ≤720p
+DEFAULT_FRAMES = 81          # ~5s @16 fps
 # ----------------------------------------------------------------------

 def load_pipeline():
-    """Lazy-load the huge model once per process."""
-    # image encoder in full precision
+    """Lazy-load & configure the pipeline once per process."""
+    # 1) load the CLIP image encoder (full precision)
     image_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
-    # VAE in reduced precision
+    # 2) load the VAE (half precision)
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
+    # 3) load the video pipeline
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
@@ -38,11 +39,14 @@ def load_pipeline():
         torch_dtype=DTYPE,
     )

-    # memory helpers for 24 GB cards / HF T4-medium
-    pipe.enable_model_cpu_offload()   # page UNet blocks off GPU
-    pipe.vae.enable_slicing()         # reduce VAE peak RAM
-    # Optional: if you have xformers installed
-    # pipe.enable_xformers_memory_efficient_attention()
+    # 4) override the processor with the fast Rust implementation
+    pipe.image_processor = CLIPImageProcessor.from_pretrained(
+        MODEL_ID, subfolder="image_processor", use_fast=True
+    )
+
+    # 5) memory helpers (offload UNet to CPU as needed)
+    # pipe.enable_model_cpu_offload()
+    # (Removed pipe.vae.enable_slicing() — not supported on AutoencoderKLWan)

     return pipe.to("cuda" if torch.cuda.is_available() else "cpu")

@@ -51,7 +55,7 @@ PIPE = load_pipeline()
 # ----------------------------------------------------------------------
 # UTILS ----------------------------------------------------------------
 def aspect_resize(img: Image.Image, max_area=MAX_AREA):
-    """Resize while respecting model patch size (multiple of transformer patch)."""
+    """Resize while keeping aspect & respecting patch multiples."""
     ar = img.height / img.width
     mod = PIPE.vae_scale_factor_spatial * PIPE.transformer.config.patch_size[1]
     h = round(np.sqrt(max_area * ar)) // mod * mod
@@ -59,10 +63,11 @@ def aspect_resize(img: Image.Image, max_area=MAX_AREA):
     return img.resize((w, h), Image.LANCZOS), h, w

 def center_crop_resize(img: Image.Image, h, w):
-    """Center-crop & resize to target H×W."""
+    """Center-crop & resize to H×W."""
     ratio = max(w / img.width, h / img.height)
     img = img.resize(
-        (round(img.width * ratio), round(img.height * ratio)), Image.LANCZOS
+        (round(img.width * ratio), round(img.height * ratio)),
+        Image.LANCZOS
     )
     return TF.center_crop(img, [h, w])

@@ -71,11 +76,12 @@ def center_crop_resize(img: Image.Image, h, w):
 def generate(first_frame, last_frame, prompt, negative_prompt, steps,
              guidance, num_frames, seed, fps):

+    # seed handling
     if seed == -1:
         seed = torch.seed()
-    generator = torch.Generator(device=PIPE.device).manual_seed(seed)
+    gen = torch.Generator(device=PIPE.device).manual_seed(seed)

-    # preprocess inputs
+    # preprocess frames
     first_frame, h, w = aspect_resize(first_frame)
     if last_frame.size != first_frame.size:
         last_frame = center_crop_resize(last_frame, h, w)
@@ -91,11 +97,11 @@ def generate(first_frame, last_frame, prompt, negative_prompt, steps,
         num_frames=num_frames,
         num_inference_steps=steps,
         guidance_scale=guidance,
-        generator=generator,
+        generator=gen,
     )
-    frames = output.frames[0]   # list[PIL.Image]
+    frames = output.frames[0]   # list of PIL Image

-    # export to .mp4
+    # export to MP4
     video_path = export_to_video(frames, fps=fps)
     return video_path, seed

@@ -108,8 +114,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     first_img = gr.Image(label="First frame", type="pil")
     last_img = gr.Image(label="Last frame", type="pil")

-    prompt = gr.Textbox(label="Prompt", placeholder="A blue bird takes off…")
-    negative = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")
+    prompt = gr.Textbox(label="Prompt", placeholder="A blue bird takes off…")
+    negative = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")

     with gr.Accordion("Advanced parameters", open=False):
         steps = gr.Slider(10, 50, value=30, step=1, label="Sampling steps")
@@ -118,8 +124,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         fps = gr.Slider(4, 30, value=16, step=1, label="FPS (export)")
         seed = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")

-    run_btn = gr.Button("Generate")
-    video = gr.Video(label="Result (.mp4)")
+    run_btn = gr.Button("Generate")
+    video = gr.Video(label="Result (.mp4)")
     used_seed = gr.Number(label="Seed used", interactive=False)

     run_btn.click(
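
Note on the aspect_resize change above: the function keeps the input aspect ratio while snapping height and width down to multiples of the transformer patch size so the latent grid divides evenly. A standalone sketch of that rounding, assuming mod = 16 (the real value comes from PIPE.vae_scale_factor_spatial * patch_size), assuming the elided `w` line mirrors the `h` computation, and using math.sqrt in place of np.sqrt to keep the sketch dependency-free:

import math

MAX_AREA = 1280 * 720            # same 720p area cap as the app
mod = 16                         # assumed patch multiple for illustration

width, height = 1024, 768        # hypothetical input frame
ar = height / width              # 0.75

h = round(math.sqrt(MAX_AREA * ar)) // mod * mod   # 831 -> 816
w = round(math.sqrt(MAX_AREA / ar)) // mod * mod   # 1109 -> 1104

print(h, w, h * w <= MAX_AREA)   # 816 1104 True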
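The new step 4 swaps the pipeline's image processor for a fast variant. A minimal way to try the same idea in isolation (assumptions: the repo really ships an image_processor subfolder as the diff implies, and the installed transformers version provides a fast processor class for it; AutoImageProcessor is used here instead of CLIPImageProcessor so the library picks the fast class itself):

from transformers import AutoImageProcessor

MODEL_ID = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"

processor = AutoImageProcessor.from_pretrained(
    MODEL_ID, subfolder="image_processor", use_fast=True
)
print(type(processor).__name__)   # e.g. a *Fast processor class when available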
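The seed handling added to generate() makes runs reproducible: -1 draws a fresh seed via torch.seed(), any other value is used as-is, and the seed is returned so a result can be replayed. A small sketch of the same pattern outside the app (make_generator is a hypothetical helper, not part of app.py):

import torch

def make_generator(seed: int, device: str = "cpu"):
    if seed == -1:
        seed = torch.seed()          # fresh random 64-bit seed
    gen = torch.Generator(device=device).manual_seed(seed)
    return gen, seed

g1, used = make_generator(-1)
g2, _ = make_generator(used)         # replay with the reported seed
print(torch.equal(torch.rand(3, generator=g1), torch.rand(3, generator=g2)))  # True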