Update app.py
app.py
CHANGED
@@ -4,7 +4,7 @@ Gradio demo for Wan2.1-FLF2V – First & Last Frame → Video
 """

 import os
-# Persist
+# Persist HF cache between launches
 os.environ["HF_HOME"] = "/mnt/data/huggingface"

 import torch
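A note on the ordering this hunk preserves: `HF_HOME` has to be exported before `transformers`/`diffusers` (and therefore `huggingface_hub`) are imported, because the hub resolves its cache paths at import time. A quick way to confirm the persistent cache is being picked up (a hedged check, not part of the commit):

import os
os.environ["HF_HOME"] = "/mnt/data/huggingface"   # must come before the hub imports

from huggingface_hub import constants
print(constants.HF_HOME)       # expect: /mnt/data/huggingface
print(constants.HF_HUB_CACHE)  # expect: /mnt/data/huggingface/hub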
@@ -12,7 +12,7 @@ import numpy as np
 import gradio as gr
 from PIL import Image
 import torchvision.transforms.functional as TF
-from transformers import CLIPVisionModel,
+from transformers import CLIPVisionModel, CLIPImageProcessor
 from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
 from diffusers.utils import export_to_video

@@ -20,7 +20,7 @@ from diffusers.utils import export_to_video
 # CONFIGURATION
 # -----------------------------------------------------------------------------
 MODEL_ID = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
-DTYPE = torch.float16
+DTYPE = torch.float16
 MAX_AREA = 1280 * 720
 DEFAULT_FRAMES = 81

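`MAX_AREA` feeds the `aspect_resize` helper, which this diff leaves untouched, so its body is not visible here. The usual Wan recipe scales the first frame so that height times width stays within the pixel budget while both sides stay multiples of the model's spatial stride; a rough sketch of that kind of helper (an assumption for illustration, not the code from this Space):

import math
from PIL import Image

def aspect_resize_sketch(img: Image.Image, max_area: int = 1280 * 720, mod: int = 16):
    # Keep the aspect ratio, cap the pixel count at max_area, and round both
    # sides down to a multiple of `mod` so the latent grid divides evenly.
    ar = img.height / img.width
    h = int(math.sqrt(max_area * ar)) // mod * mod
    w = int(math.sqrt(max_area / ar)) // mod * mod
    return img.resize((w, h), Image.LANCZOS), h, w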
@@ -28,36 +28,36 @@ DEFAULT_FRAMES = 81
 # PIPELINE LOADING (ONCE)
 # -----------------------------------------------------------------------------
 def load_pipeline():
-    # 1)
+    # 1) Vision encoder (fp32)
     clip_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
-    # 2) VAE
+    # 2) VAE (reduced precision)
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
-    # 3)
-
+    # 3) CLIPImageProcessor (exactly the type Wan expects)
+    img_processor = CLIPImageProcessor.from_pretrained(
         "openai/clip-vit-base-patch32", use_fast=True
     )
-    # 4)
+    # 4) Load the Wan‐to‐Video pipeline, balanced across GPU & CPU
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         image_encoder=clip_encoder,
         vae=vae,
-        image_processor=
+        image_processor=img_processor,
         torch_dtype=DTYPE,
         device_map="balanced",
     )
-    # 5)
+    # 5) Slice the VAE to cut VRAM spikes
     try:
         pipe.vae.enable_slicing()
     except AttributeError:
         pass
-    pipe.enable_model_cpu_offload()
     return pipe

-
+# instantiate once
+PIPE = load_pipeline()

 # -----------------------------------------------------------------------------
 # IMAGE RESIZE HELPERS
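One behavioural change in this hunk worth flagging: `pipe.enable_model_cpu_offload()` is dropped and only `device_map="balanced"` remains, which is consistent with diffusers refusing to attach CPU-offload hooks to a pipeline that has already been sharded with a device map. A small post-load sanity check (a sketch, not part of the commit; attribute names assume a recent diffusers release):

pipe = load_pipeline()
print(type(pipe.image_encoder).__name__)     # CLIPVisionModel
print(type(pipe.image_processor).__name__)   # CLIPImageProcessor
print(pipe.vae.dtype)                        # torch.float16 (DTYPE)
print(getattr(pipe, "hf_device_map", None))  # how "balanced" placed the modules, if exposed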
@@ -78,37 +78,37 @@ def center_crop_resize(img: Image.Image, h: int, w: int):
     return TF.center_crop(img, [h, w])

 # -----------------------------------------------------------------------------
-# GENERATION (STREAMING
+# GENERATION (STREAMING)
 # -----------------------------------------------------------------------------
 def generate(
-    first_frame:
-    last_frame:
-    prompt:
-    negative:
-    steps:
-    guidance:
-    num_frames:
-    seed:
-    fps:
-    progress=
+    first_frame: Image.Image,
+    last_frame: Image.Image,
+    prompt: str,
+    negative: str,
+    steps: int,
+    guidance: float,
+    num_frames: int,
+    seed: int,
+    fps: int,
+    progress=gr.Progress()
 ):
-    # Seed
+    # Seed management
     if seed == -1:
         seed = torch.seed()
     gen = torch.Generator(device=PIPE.device).manual_seed(seed)

-    # Preprocessing
+    # Preprocessing update
     progress(0, steps, desc="Preprocessing images")
     f0, h, w = aspect_resize(first_frame)
     if last_frame.size != f0.size:
         last_frame = center_crop_resize(last_frame, h, w)

-    #
+    # Step callback
     def cb(step, timestep, latents):
         progress(step, steps, desc=f"Inference step {step}/{steps}")

-    # Run pipeline
-
+    # Run the pipeline
+    out = PIPE(
         image=f0,
         last_image=last_frame,
         prompt=prompt,
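This hunk keeps the legacy `callback=cb` / `(step, timestep, latents)` interface. Recent diffusers releases route step callbacks through `callback_on_step_end` instead, and the Wan pipelines may only accept that form; if the call fails with an unexpected-keyword error, an equivalent wiring would look roughly like this (a sketch reusing the names from this hunk, with the unchanged arguments elided):

def cb_on_step_end(pipeline, step, timestep, callback_kwargs):
    # Same progress reporting as cb(), adapted to the newer callback signature.
    progress(step, steps, desc=f"Inference step {step}/{steps}")
    return callback_kwargs

out = PIPE(
    image=f0,
    last_image=last_frame,
    prompt=prompt,
    # ... unchanged arguments elided ...
    callback_on_step_end=cb_on_step_end,
)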
@@ -122,12 +122,12 @@ def generate(
         callback=cb
     )

-    # Export
-    video_path = export_to_video(
+    # Export video
+    video_path = export_to_video(out.frames[0], fps=fps)
     return video_path, seed

 # -----------------------------------------------------------------------------
-# GRADIO
+# GRADIO APP
 # -----------------------------------------------------------------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("## Wan2.1 FLF2V – First & Last Frame → Video")
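For reference, `export_to_video(out.frames[0], fps=fps)` writes the generated frames to a temporary .mp4 and returns its path, which is what `generate` hands back to Gradio; an explicit filename can also be passed (a usage sketch, not part of the commit):

from diffusers.utils import export_to_video

# out.frames is a batch; frames[0] is the frame sequence for this one video.
video_path = export_to_video(out.frames[0], "wan_flf2v_preview.mp4", fps=16)
print(video_path)  # wan_flf2v_preview.mp4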