GeradeHouse committed on
Commit 2c7ebd6 · verified · 1 parent: f956532

Update app.py

Files changed (1): app.py (+96 −94)
app.py CHANGED
@@ -1,112 +1,113 @@
(previous version)
 #!/usr/bin/env python
 """
 Gradio demo for Wan2.1 FLF2V – First & Last Frame → Video
-Streams all HF-Hub & Diffusers tqdm bars, caches the model,
-and provides a direct download link for the MP4.
 """

-import ftfy
-import numpy as np
 import torch
 import gradio as gr
-from PIL import Image
-from transformers import CLIPVisionModel, CLIPProcessor
 from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
 from diffusers.utils import export_to_video
 import torchvision.transforms.functional as TF

-# -----------------------------------------------------------------------------
 # CONFIG
-# -----------------------------------------------------------------------------
-MODEL_ID = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
-DTYPE = torch.float16
-MAX_AREA = 1280 * 720
-DEFAULT_FRAMES = 81
-
-# -----------------------------------------------------------------------------
-# GLOBAL CACHED PIPELINE
-# -----------------------------------------------------------------------------
-PIPE = None
-
 def load_pipeline():
-    """Load & cache the pipeline (once)."""
-    # 1) CLIP vision encoder (fp32)
-    vision = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
-    # 2) unified CLIP processor (fast Rust-backed + tokenizer stub)
-    processor = CLIPProcessor.from_pretrained(
-        MODEL_ID, subfolder="image_processor", use_fast=True
-    )
-    # 3) VAE (half precision)
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
-    # 4) assemble pipeline
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
-        image_encoder=vision,
-        image_processor=processor,
         torch_dtype=DTYPE,
     )
-    # 5) CPU offload for large models
     pipe.enable_model_cpu_offload()
-    # return on correct device
-    return pipe.to("cuda" if torch.cuda.is_available() else "cpu")

-# -----------------------------------------------------------------------------
-# IMAGE RESIZE HELPERS
-# -----------------------------------------------------------------------------
 def aspect_resize(img: Image.Image, max_area=MAX_AREA):
-    ar = img.height / img.width
-    mod = PIPE.transformer.config.patch_size[1] * PIPE.vae_scale_factor_spatial
-    h = (int(np.sqrt(max_area * ar)) // mod) * mod
-    w = (int(np.sqrt(max_area / ar)) // mod) * mod
     return img.resize((w, h), Image.LANCZOS), h, w

-def center_crop_resize(img: Image.Image, h: int, w: int):
     ratio = max(w / img.width, h / img.height)
-    img2 = img.resize((round(img.width * ratio), round(img.height * ratio)), Image.LANCZOS)
-    return TF.center_crop(img2, [h, w])

-# -----------------------------------------------------------------------------
-# GENERATION (streams all tqdm → Gradio)
-# -----------------------------------------------------------------------------
 def generate(
     first_frame: Image.Image,
-    last_frame: Image.Image,
-    prompt: str,
     negative_prompt: str,
-    steps: int,
-    guidance: float,
-    num_frames: int,
-    seed: int,
-    fps: int,
-    progress=gr.Progress(track_tqdm=True),
 ):
-    global PIPE
-    # lazy load once
-    if PIPE is None:
-        progress(0, desc="Loading model…")
-        PIPE = load_pipeline()
-
-    # ensure reproducibility
     if seed == -1:
         seed = torch.seed()
-    gen = torch.Generator(device=PIPE.device).manual_seed(seed)

-    # preprocess
-    progress(0, desc="Preprocessing frames…")
-    frame1, h, w = aspect_resize(first_frame)
-    if last_frame.size != frame1.size:
         last_frame = center_crop_resize(last_frame, h, w)

-    # inference (all internal tqdm bars streamed)
-    result = PIPE(
-        image=frame1,
         last_image=last_frame,
-        prompt=ftfy.fix_text(prompt),
         negative_prompt=negative_prompt or None,
         height=h,
         width=w,
@@ -114,44 +115,45 @@ def generate(
         num_inference_steps=steps,
         guidance_scale=guidance,
         generator=gen,
     )
-    frames = result.frames[0]

-    # export to MP4
-    progress(1.0, desc="Exporting video…")
-    out_path = export_to_video(frames, fps=fps)
-    return out_path, seed

-# -----------------------------------------------------------------------------
-# GRADIO UI
-# -----------------------------------------------------------------------------
-with gr.Blocks() as demo:
-    gr.Markdown("## Wan2.1 FLF2V – First & Last Frame → Video")

     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
         last_img = gr.Image(label="Last frame", type="pil")

-    prompt = gr.Textbox(label="Prompt")
-    negative = gr.Textbox(label="Negative prompt (optional)")

     with gr.Accordion("Advanced parameters", open=False):
-        steps = gr.Slider(10, 50, value=30, step=1, label="Steps")
-        guidance = gr.Slider(0.0, 10.0, value=5.5, step=0.1, label="Guidance")
-        num_frames = gr.Slider(16, 129, value=DEFAULT_FRAMES, step=1, label="Frames")
-        fps = gr.Slider(4, 30, value=16, step=1, label="FPS")
-        seed = gr.Number(value=-1, precision=0, label="Seed")

-    run_btn = gr.Button("Generate")
-    download = gr.File(label="Download video (.mp4)")
-    used_seed = gr.Number(label="Seed used", interactive=False)

     run_btn.click(
         fn=generate,
-        inputs=[first_img, last_img, prompt, negative, steps, guidance, num_frames, seed, fps],
-        outputs=[download, used_seed],
-        concurrency_limit=1
     )

-# enable queue + tqdm streaming
-demo.queue().launch()
(updated version)
 #!/usr/bin/env python
 """
 Gradio demo for Wan2.1 FLF2V – First & Last Frame → Video
 """

+import os
 import torch
+import numpy as np
 import gradio as gr
 from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
+from transformers import CLIPProcessor, CLIPVisionModel
 from diffusers.utils import export_to_video
+from PIL import Image
 import torchvision.transforms.functional as TF

+# ----------------------------------------------------------------------
 # CONFIG
+# ----------------------------------------------------------------------
+MODEL_ID = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
+DTYPE = torch.float16        # switch to torch.bfloat16 if you have an AMP-friendly GPU
+MAX_AREA = 1280 * 720        # ≤ 720p
+DEFAULT_FRAMES = 81          # ~5 s @ 16 fps
+
+# ----------------------------------------------------------------------
+# PIPELINE LOADING (once)
+# ----------------------------------------------------------------------
 def load_pipeline():
+    # 1) image encoder in fp32 for stability
+    image_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
+    # 2) VAE in reduced precision
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
+    # 3) use the unified CLIPProcessor (inherits ProcessorMixin) in fast mode
+    processor = CLIPProcessor.from_pretrained(MODEL_ID, use_fast=True)
+
+    # 4) assemble pipeline, overriding the default processor
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
+        image_encoder=image_encoder,
+        processor=processor,
         torch_dtype=DTYPE,
     )
+
+    # 5) offload to CPU / reduce footprint
     pipe.enable_model_cpu_offload()

+    # 6) safe VAE slicing if available
+    try:
+        pipe.vae.enable_slicing()
+    except (AttributeError, TypeError):
+        pass
+
+    return pipe
+
+pipe = load_pipeline()
+
+# ----------------------------------------------------------------------
+# IMAGE RESIZING HELPERS
+# ----------------------------------------------------------------------
 def aspect_resize(img: Image.Image, max_area=MAX_AREA):
+    ar = img.height / img.width
+    # align to VAE & transformer patch grid
+    mod = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
+    h = round(np.sqrt(max_area * ar)) // mod * mod
+    w = round(np.sqrt(max_area / ar)) // mod * mod
     return img.resize((w, h), Image.LANCZOS), h, w

+def center_crop_resize(img: Image.Image, h, w):
     ratio = max(w / img.width, h / img.height)
+    img = img.resize(
+        (round(img.width * ratio), round(img.height * ratio)), Image.LANCZOS
+    )
+    return TF.center_crop(img, [h, w])

+# ----------------------------------------------------------------------
+# GENERATION FUNCTION
+# ----------------------------------------------------------------------
 def generate(
     first_frame: Image.Image,
+    last_frame: Image.Image,
+    prompt: str,
     negative_prompt: str,
+    steps: int,
+    guidance: float,
+    num_frames: int,
+    seed: int,
+    fps: int,
 ):
+    # randomize seed if requested
     if seed == -1:
         seed = torch.seed()
+    gen = torch.Generator(device=pipe.device).manual_seed(seed)

+    # preprocess inputs
+    first_frame, h, w = aspect_resize(first_frame)
+    if last_frame.size != first_frame.size:
         last_frame = center_crop_resize(last_frame, h, w)

+    # set up streaming progress
+    progress = gr.Progress(track_tqdm=True)
+
+    # run the pipeline, streaming progress every step
+    result = pipe(
+        image=first_frame,
         last_image=last_frame,
+        prompt=prompt,
         negative_prompt=negative_prompt or None,
         height=h,
         width=w,
         num_inference_steps=steps,
         guidance_scale=guidance,
         generator=gen,
+        callback=progress,
+        callback_steps=1,
     )

+    # export to video and return path + seed used
+    frames = result.frames[0]
+    video_path = export_to_video(frames, fps=fps)
+    return video_path, seed

+# ----------------------------------------------------------------------
+# GRADIO APP
+# ----------------------------------------------------------------------
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## Wan 2.1 FLF2V – First & Last Frame → Video")

     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
         last_img = gr.Image(label="Last frame", type="pil")

+    prompt = gr.Textbox(label="Prompt", placeholder="A blue bird takes off…")
+    negative = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")

     with gr.Accordion("Advanced parameters", open=False):
+        steps = gr.Slider(10, 50, value=30, label="Sampling steps")
+        guidance = gr.Slider(0.0, 10.0, value=5.5, step=0.1, label="Guidance scale")
+        num_frames = gr.Slider(16, 129, value=DEFAULT_FRAMES, label="Frames")
+        fps = gr.Slider(4, 30, value=16, label="FPS (export)")
+        seed_input = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")

+    run_btn = gr.Button("Generate")
+    video_out = gr.Video(label="Result (.mp4)")
+    used_seed = gr.Number(label="Seed used", interactive=False)

     run_btn.click(
         fn=generate,
+        inputs=[first_img, last_img, prompt, negative, steps, guidance, num_frames, seed_input, fps],
+        outputs=[video_out, used_seed],
+        show_progress=True,   # hook into Gradio’s built-in progress UI
     )

+demo.queue()    # serialize GPU calls
+demo.launch()
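
For reference, the aspect_resize helper in both versions fits the input into the MAX_AREA budget at its original aspect ratio, then snaps each side down to a multiple of the transformer patch size times the VAE spatial scale factor. A standalone sketch of that rounding, using a hypothetical mod=16 in place of the value read from the loaded pipeline config:

import numpy as np

def snap_to_grid(width: int, height: int, max_area: int = 1280 * 720, mod: int = 16):
    """Fit (width, height) into max_area at the same aspect ratio, then
    floor each side to a multiple of `mod` (hypothetical grid size)."""
    ar = height / width
    h = int(np.sqrt(max_area * ar)) // mod * mod
    w = int(np.sqrt(max_area / ar)) // mod * mod
    return w, h

print(snap_to_grid(1024, 768))  # -> (1104, 816): within the 1280*720 area and on the 16-px grid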
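
Both versions rely on gr.Progress(track_tqdm=True) to stream Diffusers' internal tqdm bars into the UI. In Gradio's documented pattern the Progress object is declared as a default argument of the event handler, and any tqdm loop running inside the call is mirrored in the interface. A minimal, self-contained sketch of that pattern (the handler, slider, and labels here are illustrative, not part of this Space):

import time
import gradio as gr
from tqdm import tqdm

def slow_task(n, progress=gr.Progress(track_tqdm=True)):
    # any tqdm bar raised inside the handler is mirrored in the Gradio UI
    for _ in tqdm(range(int(n)), desc="working"):
        time.sleep(0.05)
    return f"done after {int(n)} steps"

with gr.Blocks() as sketch_demo:
    steps = gr.Slider(1, 100, value=20, step=1, label="Steps")
    status = gr.Textbox(label="Status")
    gr.Button("Run").click(slow_task, inputs=steps, outputs=status)

# sketch_demo.queue().launch()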