Spaces:

GeradeHouse
/

Wan2.1-FLF2V

Paused

App Files Community

GeradeHouse committed on Apr 25

Commit

699b386

verified ·

1 Parent(s): 64a6a24

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -54

app.py CHANGED Viewed

@@ -1,54 +1,45 @@
 #!/usr/bin/env python
 """
 Gradio demo for Wan2.1 FLF2V – First & Last Frame → Video
-Loads the huge model lazily (only once), streams **all** tqdm bars
-(from HF downloads, shard loading, to denoising) into Gradio's UI,
-and outputs a direct File download for the generated video.
 """
 import os
-import tempfile
 import ftfy
 import numpy as np
 import torch
 import gradio as gr
 from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
 from diffusers.utils import export_to_video
-from transformers import CLIPVisionModel, CLIPImageProcessor
-from PIL import Image
 # -----------------------------------------------------------------------------
 # CONFIG
 # -----------------------------------------------------------------------------
-MODEL_ID        = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
-DTYPE           = torch.float16                     # or torch.bfloat16 on AMP-friendly cards
-MAX_AREA        = 1280 * 720                        # ≤720p
-DEFAULT_FRAMES  = 81                                # ~5s @16fps
 # -----------------------------------------------------------------------------
-# GLOBAL PIPELINE (lazy)
 # -----------------------------------------------------------------------------
 PIPE = None
 def load_pipeline():
-    """
-    Load the Wan2.1-FLF2V pipeline once, with fast processor,
-    CPU-offload for large models, and in half-precision.
-    """
-    # 1) full-precision CLIP encoder
     vision = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
-    # 2) fast CLIP image processor
     processor = CLIPImageProcessor.from_pretrained(
         MODEL_ID, subfolder="preprocessor", use_fast=True
     )
-    # 3) reduced-precision VAE
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
-    # 4) assemble pipeline
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
@@ -56,18 +47,13 @@ def load_pipeline():
         image_processor=processor,
         torch_dtype=DTYPE,
     )
-    # 5) offload to CPU/AutoDevice
     pipe.enable_model_cpu_offload()
-    # (we drop .enable_slicing() because it's unsupported here)
     return pipe.to("cuda" if torch.cuda.is_available() else "cpu")
 # -----------------------------------------------------------------------------
-# UTILS
 # -----------------------------------------------------------------------------
 def aspect_resize(img: Image.Image, max_area=MAX_AREA):
-    """
-    Resize while respecting the model's patch size (multiple of 8 * transformer patch).
-    """
     ar = img.height / img.width
     mod = PIPE.transformer.config.patch_size[1] * PIPE.vae_scale_factor_spatial
     h = (int(np.sqrt(max_area * ar)) // mod) * mod
@@ -75,15 +61,12 @@ def aspect_resize(img: Image.Image, max_area=MAX_AREA):
     return img.resize((w, h), Image.LANCZOS), h, w
 def center_crop_resize(img: Image.Image, h: int, w: int):
-    """
-    Center-crop + resize to exactly h×w.
-    """
     ratio = max(w / img.width, h / img.height)
     img2 = img.resize((round(img.width * ratio), round(img.height * ratio)), Image.LANCZOS)
     return TF.center_crop(img2, [h, w])
 # -----------------------------------------------------------------------------
-# GENERATION (with full tqdm → Gradio progress streaming)
 # -----------------------------------------------------------------------------
 def generate(
     first_frame: Image.Image,
@@ -98,27 +81,27 @@ def generate(
     progress=gr.Progress(track_tqdm=True),
 ):
     global PIPE
-    # lazy instantiate
     if PIPE is None:
         progress(0, desc="Loading pipeline…")
         PIPE = load_pipeline()
-    # seeding
     if seed == -1:
         seed = torch.seed()
     gen = torch.Generator(device=PIPE.device).manual_seed(seed)
-    # preprocess
-    progress(0, desc="Preprocessing…")
     frame1, h, w = aspect_resize(first_frame)
     if last_frame.size != frame1.size:
         last_frame = center_crop_resize(last_frame, h, w)
-    # inference (all tqdm inside will stream to UI)
     result = PIPE(
         image=frame1,
         last_image=last_frame,
-        prompt=whitespace_clean(basic_clean(prompt)),
         negative_prompt=negative_prompt or None,
         height=h,
         width=w,
@@ -126,42 +109,40 @@ def generate(
         num_inference_steps=steps,
         guidance_scale=guidance,
         generator=gen,
-        # no callback_steps here!
     )
-    frames = result.frames[0]  # list of PIL images
-    # export to MP4
-    progress(1.0, desc="Assembling video…")
     out_path = export_to_video(frames, fps=fps)
     return out_path, seed
 # -----------------------------------------------------------------------------
-# BUILD UI
 # -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## Wan 2.1 FLF2V – First & Last Frame → Video (Diffusers)")
     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
         last_img  = gr.Image(label="Last frame",  type="pil")
-    prompt        = gr.Textbox(label="Prompt", placeholder="A small blue bird takes off…")
-    negative      = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")
     with gr.Accordion("Advanced parameters", open=False):
-        steps      = gr.Slider(10, 50, value=30,  step=1,  label="Sampling steps")
-        guidance   = gr.Slider(0.0, 10.0, value=5.5,  step=0.1, label="Guidance scale")
         num_frames = gr.Slider(16, 129, value=DEFAULT_FRAMES, step=1, label="Frames")
-        fps        = gr.Slider(4, 30, value=16,  step=1, label="FPS")
-        seed       = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")
-    run_btn = gr.Button("Generate")
-    # **File** component for direct download link:
     download = gr.File(label="Download video (.mp4)")
-    used_seed = gr.Number(label="Seed used", interactive=False)
-    # queue() for async + progress
     run_btn.click(
         fn=generate,
         inputs=[first_img, last_img, prompt, negative, steps, guidance, num_frames, seed, fps],
         outputs=[download, used_seed],
     )
-# MUST call .queue() to enable gr.Progress()
-demo.queue(concurrency_count=1).launch()

 #!/usr/bin/env python
 """
 Gradio demo for Wan2.1 FLF2V – First & Last Frame → Video
+Streams all HF-Hub & Diffusers tqdm bars into Gradio, caches the pipeline,
+and outputs a direct download link.
 """
 import os
 import ftfy
 import numpy as np
 import torch
 import gradio as gr
+from PIL import Image
+from transformers import CLIPVisionModel, CLIPImageProcessor
 from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
 from diffusers.utils import export_to_video
+import torchvision.transforms.functional as TF
 # -----------------------------------------------------------------------------
 # CONFIG
 # -----------------------------------------------------------------------------
+MODEL_ID       = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
+DTYPE          = torch.float16
+MAX_AREA       = 1280 * 720
+DEFAULT_FRAMES = 81
 # -----------------------------------------------------------------------------
+# GLOBAL CACHED PIPELINE
 # -----------------------------------------------------------------------------
 PIPE = None
 def load_pipeline():
+    """Load & shard the pipeline once (CPU offload + fast processor)."""
     vision = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
     processor = CLIPImageProcessor.from_pretrained(
         MODEL_ID, subfolder="preprocessor", use_fast=True
     )
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
         image_processor=processor,
         torch_dtype=DTYPE,
     )
     pipe.enable_model_cpu_offload()
     return pipe.to("cuda" if torch.cuda.is_available() else "cpu")
 # -----------------------------------------------------------------------------
+# IMAGE RESIZE HELPERS
 # -----------------------------------------------------------------------------
 def aspect_resize(img: Image.Image, max_area=MAX_AREA):
     ar = img.height / img.width
     mod = PIPE.transformer.config.patch_size[1] * PIPE.vae_scale_factor_spatial
     h = (int(np.sqrt(max_area * ar)) // mod) * mod
     return img.resize((w, h), Image.LANCZOS), h, w
 def center_crop_resize(img: Image.Image, h: int, w: int):
     ratio = max(w / img.width, h / img.height)
     img2 = img.resize((round(img.width * ratio), round(img.height * ratio)), Image.LANCZOS)
     return TF.center_crop(img2, [h, w])
 # -----------------------------------------------------------------------------
+# GENERATION FUNCTION (with tqdm streaming)
 # -----------------------------------------------------------------------------
 def generate(
     first_frame: Image.Image,
     progress=gr.Progress(track_tqdm=True),
 ):
     global PIPE
+    # Lazy load pipeline
     if PIPE is None:
         progress(0, desc="Loading pipeline…")
         PIPE = load_pipeline()
+    # Seed
     if seed == -1:
         seed = torch.seed()
     gen = torch.Generator(device=PIPE.device).manual_seed(seed)
+    # Preprocess
+    progress(0, desc="Preprocessing frames…")
     frame1, h, w = aspect_resize(first_frame)
     if last_frame.size != frame1.size:
         last_frame = center_crop_resize(last_frame, h, w)
+    # Inference (tqdm bars streamed)
     result = PIPE(
         image=frame1,
         last_image=last_frame,
+        prompt=ftfy.fix_text(prompt),
         negative_prompt=negative_prompt or None,
         height=h,
         width=w,
         num_inference_steps=steps,
         guidance_scale=guidance,
         generator=gen,
     )
+    frames = result.frames[0]
+    # Export
+    progress(1.0, desc="Exporting video…")
     out_path = export_to_video(frames, fps=fps)
     return out_path, seed
 # -----------------------------------------------------------------------------
+# GRADIO UI
 # -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("## Wan2.1 FLF2V – First & Last Frame → Video")
     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
         last_img  = gr.Image(label="Last frame",  type="pil")
+    prompt   = gr.Textbox(label="Prompt")
+    negative = gr.Textbox(label="Negative prompt (optional)")
     with gr.Accordion("Advanced parameters", open=False):
+        steps      = gr.Slider(10, 50, value=30, step=1, label="Steps")
+        guidance   = gr.Slider(0.0, 10.0, value=5.5, step=0.1, label="Guidance")
         num_frames = gr.Slider(16, 129, value=DEFAULT_FRAMES, step=1, label="Frames")
+        fps        = gr.Slider(4, 30, value=16, step=1, label="FPS")
+        seed       = gr.Number(value=-1, precision=0, label="Seed")
+    run_btn  = gr.Button("Generate")
     download = gr.File(label="Download video (.mp4)")
+    used_seed= gr.Number(label="Seed used", interactive=False)
     run_btn.click(
         fn=generate,
         inputs=[first_img, last_img, prompt, negative, steps, guidance, num_frames, seed, fps],
         outputs=[download, used_seed],
+        concurrency_limit=1
     )
+# **Enable queuing** (uses default_concurrency_limit=1 under the hood)
+demo.queue().launch()