# echomimic-v2 / app.py
import os

import gradio as gr
import torch

from src.inference import inference_pipeline
from src.utils import load_config
dtype = torch.float16

if torch.cuda.is_available():
    device = "cuda"
    total_vram_in_gb = torch.cuda.get_device_properties(0).total_memory / 1073741824
    print(f"\033[32mCUDA version: {torch.version.cuda}\033[0m")
    print(f"\033[32mPyTorch version: {torch.__version__}\033[0m")
    print(f"\033[32mGPU: {torch.cuda.get_device_name()}\033[0m")
    print(f"\033[32mVRAM: {total_vram_in_gb:.2f} GB\033[0m")
    print(f"\033[32mPrecision: float16\033[0m")
else:
    print("CUDA not available, using CPU")
    device = "cpu"
ffmpeg_path = os.getenv("FFMPEG_PATH")
if ffmpeg_path is None:
    print(
        "Please download ffmpeg-static and export it as FFMPEG_PATH.\n"
        "For example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static"
    )
elif ffmpeg_path not in os.environ.get("PATH", ""):
    print("Adding ffmpeg to PATH")
    os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"
# Load the inference config and pick the target device
def initialize_model():
    config = load_config("./configs/prompts/infer.yaml")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return config, device
# Run inference for a single (audio, reference image) pair
def generate_animation(audio_file, reference_image, config, device):
    try:
        # Both inputs arrive as filepath strings (the components below use
        # type="filepath"), so they are passed through unchanged.
        output_video = inference_pipeline(
            audio_path=audio_file,
            reference_image_path=reference_image,
            config=config,
            device=device,
        )
        return output_video
    except Exception as e:
        # Surface failures in the UI rather than returning an error string
        # to a Video component.
        raise gr.Error(str(e))
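# Example of driving the pipeline without the UI (the asset paths are
# hypothetical, for illustration only):
#   config, device = initialize_model()
#   video = generate_animation("assets/sample.wav", "assets/ref.png", config, device)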
# Load the config and device once at startup
config, device = initialize_model()
# Create the Gradio interface
with gr.Blocks(title="EchoMimicV2: Audio-Driven Human Animation") as demo:
    gr.Markdown(
        """
        # EchoMimicV2: Audio-Driven Human Animation
        Upload a reference image and an audio file to generate an animated video.
        """
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio", type="filepath")
            image_input = gr.Image(label="Upload Reference Image", type="filepath")
            generate_btn = gr.Button("Generate Animation")
        with gr.Column():
            output_video = gr.Video(label="Generated Animation")

    generate_btn.click(
        fn=generate_animation,
        inputs=[audio_input, image_input, gr.State(config), gr.State(device)],
        outputs=output_video,
    )
# Launch the app
demo.launch()
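# demo.launch() also accepts standard Gradio options if needed, e.g. a fixed
# port or a public share link (shown for illustration only):
#   demo.launch(server_name="0.0.0.0", server_port=7860, share=True)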