Upload 4 files
- .gitignore +40 -0
- README.md +58 -13
- app.py +96 -0
- requirements.txt +35 -0
.gitignore
ADDED
@@ -0,0 +1,40 @@
+# Dataset and assets
+EMTD_dataset/
+assets/
+outputs/
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Model files (these will be downloaded by the Space)
+pretrained_weights/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
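The comment above `pretrained_weights/` says the model files are downloaded by the Space rather than committed. A minimal sketch of such a bootstrap step, assuming the weights are fetched as a full snapshot of the [BadToBest/EchoMimicV2](https://huggingface.co/BadToBest/EchoMimicV2) repo; the commit does not show how the Space actually performs the download:

```python
# Hypothetical startup snippet: fetch the EchoMimicV2 weights into the
# git-ignored pretrained_weights/ directory if they are not already there.
from pathlib import Path
from huggingface_hub import snapshot_download

weights_dir = Path("pretrained_weights")
if not weights_dir.exists():
    snapshot_download(
        repo_id="BadToBest/EchoMimicV2",  # model repo referenced in the README
        local_dir=str(weights_dir),
    )
```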
README.md
CHANGED
@@ -1,13 +1,58 @@
----
-title:
-emoji:
-colorFrom:
-colorTo:
-sdk: gradio
-sdk_version:
-app_file: app.py
-pinned: false
-
-
-
-
+---
+title: "EchoMimicV2: Audio-Driven Human Animation"
+emoji: "🎬"
+colorFrom: "blue"
+colorTo: "purple"
+sdk: "gradio"
+sdk_version: "4.19.2"
+app_file: "app.py"
+pinned: false
+---
+
+# EchoMimicV2: Audio-Driven Human Animation
+
+This Space provides a web interface for EchoMimicV2, an AI model that generates human animations from audio input and a reference image.
+
+## How to Use
+
+1. Upload an audio file (WAV format recommended)
+2. Upload a reference image of a person
+3. Click "Generate Animation" to create the video
+4. Wait for the processing to complete
+5. Download the generated video
+
+## Features
+
+- Audio-driven human animation
+- Support for both English and Chinese audio
+- High-quality video generation
+- Realistic facial expressions and body movements
+
+## Model Information
+
+This Space uses the EchoMimicV2 model from [BadToBest/EchoMimicV2](https://huggingface.co/BadToBest/EchoMimicV2).
+
+## Requirements
+
+- Audio file (WAV format recommended)
+- Reference image of a person (clear face visible)
+- Processing time varies based on input length
+
+## Limitations
+
+- Best results with clear audio and front-facing reference images
+- Processing time depends on video length
+- Substantial GPU memory is needed for optimal performance
+
+## Citation
+
+If you use this model, please cite:
+```
+@misc{meng2024echomimicv2,
+      title={EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation},
+      author={Rang Meng and Xingyu Zhang and Yuming Li and Chenguang Ma},
+      year={2024},
+      eprint={2411.10061},
+      archivePrefix={arXiv}
+}
+```
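The "How to Use" steps above describe the web UI. The same Space can usually also be driven programmatically with `gradio_client`, which is pinned in `requirements.txt`. A sketch under assumptions: the Space ID `your-username/echomimicv2` is hypothetical, and the generate click handler is assumed to be the first registered endpoint:

```python
# Hypothetical API call to the Space; the Space ID and fn_index are assumptions.
from gradio_client import Client, handle_file

client = Client("your-username/echomimicv2")  # hypothetical Space ID
video_path = client.predict(
    handle_file("speech.wav"),   # audio input (WAV recommended)
    handle_file("person.png"),   # reference image with a clearly visible face
    fn_index=0,                  # assumes the generate handler is endpoint 0
)
print(video_path)                # local path to the downloaded video
```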
app.py
ADDED
@@ -0,0 +1,96 @@
+import os
+import random
+from pathlib import Path
+import numpy as np
+import torch
+from diffusers import AutoencoderKL, DDIMScheduler
+from PIL import Image
+from src.models.unet_2d_condition import UNet2DConditionModel
+from src.models.unet_3d_emo import EMOUNet3DConditionModel
+from src.models.whisper.audio2feature import load_audio_model
+from src.pipelines.pipeline_echomimicv2 import EchoMimicV2Pipeline
+from src.utils.util import save_videos_grid
+from src.models.pose_encoder import PoseEncoder
+from src.utils.dwpose_util import draw_pose_select_v2
+from moviepy.editor import VideoFileClip, AudioFileClip
+
+import gradio as gr
+from datetime import datetime
+from torchao.quantization import quantize_, int8_weight_only
+import gc
+from src.inference import inference_pipeline
+from src.utils import load_config
+
+dtype = torch.float16
+if torch.cuda.is_available():
+    device = "cuda"
+    total_vram_in_gb = torch.cuda.get_device_properties(0).total_memory / 1073741824
+    print(f'\033[32mCUDA version: {torch.version.cuda}\033[0m')
+    print(f'\033[32mPyTorch version: {torch.__version__}\033[0m')
+    print(f'\033[32mGPU model: {torch.cuda.get_device_name()}\033[0m')
+    print(f'\033[32mVRAM size: {total_vram_in_gb:.2f} GB\033[0m')
+    print(f'\033[32mPrecision: float16\033[0m')
+else:
+    print("CUDA not available, using CPU")
+    device = "cpu"
+
+ffmpeg_path = os.getenv('FFMPEG_PATH')
+if ffmpeg_path is None:
+    print("Please download ffmpeg-static and export it as FFMPEG_PATH.\nFor example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static")
+elif ffmpeg_path not in os.getenv('PATH', ''):
+    print("Adding ffmpeg to PATH")
+    os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"
+
+# Load the inference configuration and pick a device
+def initialize_model():
+    config = load_config('./configs/prompts/infer.yaml')
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    return config, device
+
+# Run inference on one audio/image pair
+def generate_animation(audio_file, reference_image, config, device):
+    try:
+        # audio_file and reference_image are plain path strings (type="filepath" below)
+        output_video = inference_pipeline(
+            audio_path=audio_file,
+            reference_image_path=reference_image,
+            config=config,
+            device=device
+        )
+        return output_video
+    except Exception as e:
+        raise gr.Error(str(e))
+
+# Initialize the model
+config, device = initialize_model()
+
+# Create the Gradio interface
+with gr.Blocks(title="EchoMimicV2: Audio-Driven Human Animation") as demo:
+    gr.Markdown("""
+    # EchoMimicV2: Audio-Driven Human Animation
+    Upload a reference image and audio file to generate an animated video.
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                label="Upload Audio",
+                type="filepath"
+            )
+            image_input = gr.Image(
+                label="Upload Reference Image",
+                type="filepath"
+            )
+            generate_btn = gr.Button("Generate Animation")
+
+        with gr.Column():
+            output_video = gr.Video(label="Generated Animation")
+
+    generate_btn.click(
+        fn=generate_animation,
+        inputs=[audio_input, image_input, gr.State(config), gr.State(device)],
+        outputs=output_video
+    )
+
+# Launch the app
+demo.launch()
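`app.py` imports `quantize_` and `int8_weight_only` from torchao but never calls them (and torchao itself is not listed in `requirements.txt`), presumably groundwork for cutting the UNet's VRAM footprint. For reference, this is how those imports are typically applied; `pipe.unet` is an assumption, since the pipeline construction is not shown in this commit:

```python
# Hypothetical use of app.py's unused torchao imports: int8 weight-only
# quantization of the diffusion UNet to reduce GPU memory usage.
from torchao.quantization import quantize_, int8_weight_only

def quantize_unet(pipe):
    # quantize_ rewrites the module in place, swapping linear-layer weights
    # for int8 tensors that are dequantized on the fly during forward passes.
    quantize_(pipe.unet, int8_weight_only())
    return pipe
```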
requirements.txt
ADDED
@@ -0,0 +1,35 @@
+transformers>=4.46.3
+diffusers==0.31.0
+torchmetrics
+torchtyping
+tqdm
+einops==0.8.0
+omegaconf==2.3.0
+opencv-python
+av==13.1.0
+gradio
+accelerate==1.1.1
+clip @ https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip#sha256=b5842c25da441d6c581b53a5c60e0c2127ebafe0f746f8e15561a006c6c3be6a
+decord==0.6.0
+gradio_client==1.4.3
+imageio==2.36.0
+imageio-ffmpeg==0.5.1
+numpy==1.26.4
+onnxruntime-gpu==1.20.1
+open-clip-torch==2.29.0
+opencv-contrib-python==4.10.0.84
+Pillow<10.3.0,>=10.2.0
+scikit-image==0.24.0
+scikit-learn==1.5.2
+scipy==1.14.1
+torchdiffeq==0.2.5
+torchsde==0.2.6
+mlflow==2.18.0
+controlnet-aux==0.0.9
+ffmpeg-python
+soundfile
+mediapipe
+IPython
+scenedetect
+moviepy==1.0.3
+huggingface_hub==0.26.2
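`onnxruntime-gpu==1.20.1` is pinned, most likely for the DWPose models behind `src.utils.dwpose_util`; if it silently falls back to CPU execution, pose extraction becomes very slow. A quick sanity check for the deployed environment (the assertion message is an assumption about the usual cause):

```python
# Verify that onnxruntime-gpu can actually see the GPU; a CPU-only provider
# list usually indicates a CUDA/cuDNN version mismatch in the image.
import onnxruntime as ort

providers = ort.get_available_providers()
print(providers)
assert "CUDAExecutionProvider" in providers, "onnxruntime is running CPU-only"
```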