#!/usr/bin/env python
"""
Gradio demo for Wan2.1 FLF2V – First & Last Frame → Video.

Streams all HF-Hub & Diffusers tqdm bars to the UI, caches the model after the
first load, and provides a direct download link for the generated MP4.
"""
import ftfy
import numpy as np
import torch
import gradio as gr
from PIL import Image
from transformers import CLIPVisionModel, CLIPImageProcessor
from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
from diffusers.utils import export_to_video
import torchvision.transforms.functional as TF

# -----------------------------------------------------------------------------
# CONFIG
# -----------------------------------------------------------------------------
MODEL_ID = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
DTYPE = torch.float16
MAX_AREA = 1280 * 720       # 720p pixel budget for the resize helper
DEFAULT_FRAMES = 81         # ~5 s at 16 fps

# -----------------------------------------------------------------------------
# GLOBAL CACHED PIPELINE
# -----------------------------------------------------------------------------
PIPE = None


def load_pipeline():
    """Load & cache the pipeline (once)."""
    # 1) CLIP vision encoder (kept in fp32 for stability)
    vision = CLIPVisionModel.from_pretrained(
        MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
    )
    # 2) CLIP image processor
    processor = CLIPImageProcessor.from_pretrained(
        MODEL_ID, subfolder="image_processor", use_fast=True
    )
    # 3) VAE (half precision)
    vae = AutoencoderKLWan.from_pretrained(
        MODEL_ID, subfolder="vae", torch_dtype=DTYPE
    )
    # 4) pipeline assembly
    pipe = WanImageToVideoPipeline.from_pretrained(
        MODEL_ID,
        vae=vae,
        image_encoder=vision,
        image_processor=processor,
        torch_dtype=DTYPE,
    )
    # 5) Model CPU offload keeps the 14B model within GPU memory; it moves
    #    modules to the GPU on demand, so do NOT also call .to("cuda") afterwards
    #    (Diffusers rejects that combination). Without CUDA the pipeline simply
    #    stays on the CPU.
    if torch.cuda.is_available():
        pipe.enable_model_cpu_offload()
    return pipe


# -----------------------------------------------------------------------------
# IMAGE RESIZE HELPERS
# -----------------------------------------------------------------------------
def aspect_resize(img: Image.Image, max_area=MAX_AREA):
    """Resize keeping the aspect ratio, snapping H/W to the model's spatial
    granularity (transformer patch size × VAE spatial scale factor)."""
    ar = img.height / img.width
    mod = PIPE.transformer.config.patch_size[1] * PIPE.vae_scale_factor_spatial
    h = (int(np.sqrt(max_area * ar)) // mod) * mod
    w = (int(np.sqrt(max_area / ar)) // mod) * mod
    return img.resize((w, h), Image.LANCZOS), h, w


def center_crop_resize(img: Image.Image, h: int, w: int):
    """Scale the image to cover the target size, then center-crop to (h, w)."""
    ratio = max(w / img.width, h / img.height)
    img2 = img.resize(
        (round(img.width * ratio), round(img.height * ratio)), Image.LANCZOS
    )
    return TF.center_crop(img2, [h, w])


# -----------------------------------------------------------------------------
# GENERATION (stream all tqdm → Gradio)
# -----------------------------------------------------------------------------
def generate(
    first_frame: Image.Image,
    last_frame: Image.Image,
    prompt: str,
    negative_prompt: str,
    steps: int,
    guidance: float,
    num_frames: int,
    seed: int,
    fps: int,
    progress=gr.Progress(track_tqdm=True),
):
    global PIPE

    # lazy load
    if PIPE is None:
        progress(0, desc="Loading model…")
        PIPE = load_pipeline()

    # seed (-1 → draw a fresh random seed and report it back to the UI)
    if seed == -1:
        seed = torch.seed()
    gen = torch.Generator(device=PIPE.device).manual_seed(seed)

    # preprocess
    progress(0, desc="Preprocessing…")
    frame1, h, w = aspect_resize(first_frame)
    if last_frame.size != frame1.size:
        last_frame = center_crop_resize(last_frame, h, w)

    # inference (all tqdm bars appear in the Gradio progress)
    result = PIPE(
        image=frame1,
        last_image=last_frame,
        prompt=ftfy.fix_text(prompt),
        negative_prompt=negative_prompt or None,
        height=h,
        width=w,
        num_frames=num_frames,
        num_inference_steps=steps,
        guidance_scale=guidance,
        generator=gen,
    )
    frames = result.frames[0]

    # export
    progress(1.0, desc="Exporting video…")
    out_path = export_to_video(frames, fps=fps)
    return out_path, seed


# -----------------------------------------------------------------------------
# GRADIO UI
# -----------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Wan2.1 FLF2V – First & Last Frame → Video")

    with gr.Row():
        first_img = gr.Image(label="First frame", type="pil")
        last_img = gr.Image(label="Last frame", type="pil")

    prompt = gr.Textbox(label="Prompt")
    negative = gr.Textbox(label="Negative prompt (optional)")

    with gr.Accordion("Advanced parameters", open=False):
        steps = gr.Slider(10, 50, value=30, step=1, label="Steps")
        guidance = gr.Slider(0.0, 10.0, value=5.5, step=0.1, label="Guidance")
        num_frames = gr.Slider(16, 129, value=DEFAULT_FRAMES, step=1, label="Frames")
        fps = gr.Slider(4, 30, value=16, step=1, label="FPS")
        seed = gr.Number(value=-1, precision=0, label="Seed")

    run_btn = gr.Button("Generate")
    download = gr.File(label="Download video (.mp4)")
    used_seed = gr.Number(label="Seed used", interactive=False)

    run_btn.click(
        fn=generate,
        inputs=[first_img, last_img, prompt, negative, steps,
                guidance, num_frames, seed, fps],
        outputs=[download, used_seed],
        concurrency_limit=1,
    )

# enable progress streaming
demo.queue().launch()
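
# -----------------------------------------------------------------------------
# USAGE (sketch)
# -----------------------------------------------------------------------------
# Assuming this script is saved as app.py (filename is an assumption) and the
# dependencies it imports (gradio, diffusers, transformers, torch, torchvision,
# ftfy, numpy) are installed:
#
#   python app.py
#
# Gradio serves the UI on http://127.0.0.1:7860 by default; pass share=True to
# launch() above if a temporary public link is needed.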