import os
import random
import gc
from pathlib import Path
from datetime import datetime

import numpy as np
import torch
from PIL import Image
from diffusers import AutoencoderKL, DDIMScheduler
from moviepy.editor import VideoFileClip, AudioFileClip
from torchao.quantization import quantize_, int8_weight_only
import gradio as gr

from src.models.unet_2d_condition import UNet2DConditionModel
from src.models.unet_3d_emo import EMOUNet3DConditionModel
from src.models.whisper.audio2feature import load_audio_model
from src.pipelines.pipeline_echomimicv2 import EchoMimicV2Pipeline
from src.utils.util import save_videos_grid
from src.models.pose_encoder import PoseEncoder
from src.utils.dwpose_util import draw_pose_select_v2
from src.inference import inference_pipeline
from src.utils import load_config

dtype = torch.float16

if torch.cuda.is_available():
    device = "cuda"
    # Query GPU properties only when CUDA is present, otherwise these calls raise.
    total_vram_in_gb = torch.cuda.get_device_properties(0).total_memory / 1073741824
    print(f'\033[32mCUDA version: {torch.version.cuda}\033[0m')
    print(f'\033[32mPyTorch version: {torch.__version__}\033[0m')
    print(f'\033[32mGPU model: {torch.cuda.get_device_name()}\033[0m')
    print(f'\033[32mVRAM size: {total_vram_in_gb:.2f}GB\033[0m')
    print(f'\033[32mPrecision: float16\033[0m')
else:
    print("cuda not available, using cpu")
    device = "cpu"

# Make sure a static ffmpeg build is on PATH (needed for audio/video muxing)
ffmpeg_path = os.getenv('FFMPEG_PATH')
if ffmpeg_path is None:
    print("please download ffmpeg-static and export to FFMPEG_PATH. \n"
          "For example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static")
elif ffmpeg_path not in os.getenv('PATH', ''):
    print("add ffmpeg to path")
    os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"


# Initialize the model
def initialize_model():
    config = load_config('./configs/prompts/infer.yaml')
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return config, device


# Create the inference function
def generate_animation(audio_path, reference_image_path, config, device):
    try:
        # Run inference. The Audio/Image components are created with
        # type="filepath", so Gradio passes plain path strings here,
        # not file objects with a .name attribute.
        output_video = inference_pipeline(
            audio_path=audio_path,
            reference_image_path=reference_image_path,
            config=config,
            device=device
        )
        return output_video
    except Exception as e:
        return str(e)


# Initialize the model
config, device = initialize_model()

# Create the Gradio interface
with gr.Blocks(title="EchoMimicV2: Audio-Driven Human Animation") as demo:
    gr.Markdown("""
    # EchoMimicV2: Audio-Driven Human Animation
    Upload a reference image and audio file to generate an animated video.
    """)

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath"
            )
            image_input = gr.Image(
                label="Upload Reference Image",
                type="filepath"
            )
            generate_btn = gr.Button("Generate Animation")

        with gr.Column():
            output_video = gr.Video(label="Generated Animation")

    generate_btn.click(
        fn=generate_animation,
        inputs=[audio_input, image_input, gr.State(config), gr.State(device)],
        outputs=output_video
    )

# Launch the app
demo.launch()