arsh121 committed
Commit f5034fe · verified · 1 Parent(s): 254a76a

Upload 4 files

Files changed (4)
  1. .gitignore +40 -0
  2. README.md +58 -13
  3. app.py +96 -0
  4. requirements.txt +35 -0
.gitignore ADDED
@@ -0,0 +1,40 @@
+ # Dataset and assets
+ EMTD_dataset/
+ assets/
+ outputs/
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Model files (these will be downloaded by the Space)
+ pretrained_weights/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
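The ignored `pretrained_weights/` directory is expected to be filled at runtime ("Model files (these will be downloaded by the Space)"). A minimal sketch of how that download could be done with `huggingface_hub` (pinned in requirements.txt below); the repo id comes from the README, while the target directory is an assumption matching the .gitignore entry:

```python
# Hypothetical startup snippet, not part of this commit: fetch the
# EchoMimicV2 weights into the ignored pretrained_weights/ directory.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="BadToBest/EchoMimicV2",    # model repo referenced in README.md
    local_dir="./pretrained_weights",   # matches the .gitignore entry above
)
```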
README.md CHANGED
@@ -1,13 +1,58 @@
- ---
- title: Echomimic V2
- emoji: 🦀
- colorFrom: yellow
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.29.1
- app_file: app.py
- pinned: false
- short_description: chomimic-v2
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: "EchoMimicV2: Audio-Driven Human Animation"
+ emoji: "🎬"
+ colorFrom: "blue"
+ colorTo: "purple"
+ sdk: "gradio"
+ sdk_version: "4.19.2"
+ app_file: "app.py"
+ pinned: false
+ ---
+
+ # EchoMimicV2: Audio-Driven Human Animation
+
+ This Space provides a web interface for EchoMimicV2, an AI model that generates human animations from audio input and a reference image.
+
+ ## How to Use
+
+ 1. Upload an audio file (WAV format recommended)
+ 2. Upload a reference image of a person
+ 3. Click "Generate Animation" to create the video
+ 4. Wait for processing to complete
+ 5. Download the generated video
+
+ ## Features
+
+ - Audio-driven human animation
+ - Support for both English and Chinese audio
+ - High-quality video generation
+ - Realistic facial expressions and body movements
+
+ ## Model Information
+
+ This Space uses the EchoMimicV2 model from [BadToBest/EchoMimicV2](https://huggingface.co/BadToBest/EchoMimicV2).
+
+ ## Requirements
+
+ - Audio file (WAV format recommended)
+ - Reference image of a person (clear face visible)
+ - Processing time varies based on input length
+
+ ## Limitations
+
+ - Best results with clear audio and front-facing reference images
+ - Processing time depends on video length
+ - Requires substantial GPU memory for optimal performance
+
+ ## Citation
+
+ If you use this model, please cite:
+ ```
+ @misc{meng2024echomimicv2,
+   title={EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation},
+   author={Rang Meng and Xingyu Zhang and Yuming Li and Chenguang Ma},
+   year={2024},
+   eprint={2411.10061},
+   archivePrefix={arXiv}
+ }
+ ```
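Besides the UI workflow in "How to Use", the Space can also be called programmatically with `gradio_client` (pinned in requirements.txt). A rough sketch, assuming a Space id of `arsh121/echomimic-v2` and the default endpoint exposed by the single button click; the id, endpoint, and argument order are all assumptions:

```python
# Hypothetical client-side call; Space id and argument order are assumptions
# and may need adjusting to the deployed app.
from gradio_client import Client, handle_file

client = Client("arsh121/echomimic-v2")   # assumed Space id
result = client.predict(
    handle_file("speech.wav"),            # audio input (filepath)
    handle_file("reference.png"),         # reference image (filepath)
)
print(result)                             # path to the generated video
```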
app.py ADDED
@@ -0,0 +1,96 @@
+ import os
+ import random
+ from pathlib import Path
+ import numpy as np
+ import torch
+ from diffusers import AutoencoderKL, DDIMScheduler
+ from PIL import Image
+ from src.models.unet_2d_condition import UNet2DConditionModel
+ from src.models.unet_3d_emo import EMOUNet3DConditionModel
+ from src.models.whisper.audio2feature import load_audio_model
+ from src.pipelines.pipeline_echomimicv2 import EchoMimicV2Pipeline
+ from src.utils.util import save_videos_grid
+ from src.models.pose_encoder import PoseEncoder
+ from src.utils.dwpose_util import draw_pose_select_v2
+ from moviepy.editor import VideoFileClip, AudioFileClip
+
+ import gradio as gr
+ from datetime import datetime
+ from torchao.quantization import quantize_, int8_weight_only
+ import gc
+ from src.inference import inference_pipeline
+ from src.utils import load_config
+
+ # Report the runtime environment (GPU details only when CUDA is available).
+ dtype = torch.float16
+ if torch.cuda.is_available():
+     device = "cuda"
+     total_vram_in_gb = torch.cuda.get_device_properties(0).total_memory / 1073741824
+     print(f'\033[32mCUDA version: {torch.version.cuda}\033[0m')
+     print(f'\033[32mPyTorch version: {torch.__version__}\033[0m')
+     print(f'\033[32mGPU model: {torch.cuda.get_device_name()}\033[0m')
+     print(f'\033[32mVRAM size: {total_vram_in_gb:.2f}GB\033[0m')
+     print(f'\033[32mPrecision: float16\033[0m')
+ else:
+     print("CUDA not available, using CPU")
+     device = "cpu"
+
+ ffmpeg_path = os.getenv('FFMPEG_PATH')
+ if ffmpeg_path is None:
+     print("Please download ffmpeg-static and export FFMPEG_PATH.\nFor example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static")
+ elif ffmpeg_path not in os.getenv('PATH', ''):
+     print("Adding ffmpeg to PATH")
+     os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"
+
+ # Load the inference configuration and pick the device.
+ def initialize_model():
+     config = load_config('./configs/prompts/infer.yaml')
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     return config, device
+
+ # Run the EchoMimicV2 pipeline; both inputs arrive as file paths (type="filepath").
+ def generate_animation(audio_file, reference_image, config, device):
+     try:
+         output_video = inference_pipeline(
+             audio_path=audio_file,
+             reference_image_path=reference_image,
+             config=config,
+             device=device
+         )
+         return output_video
+     except Exception as e:
+         raise gr.Error(str(e))
+
+ # Initialize the model once at startup.
+ config, device = initialize_model()
+
+ # Create the Gradio interface.
+ with gr.Blocks(title="EchoMimicV2: Audio-Driven Human Animation") as demo:
+     gr.Markdown("""
+     # EchoMimicV2: Audio-Driven Human Animation
+     Upload a reference image and audio file to generate an animated video.
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 label="Upload Audio",
+                 type="filepath"
+             )
+             image_input = gr.Image(
+                 label="Upload Reference Image",
+                 type="filepath"
+             )
+             generate_btn = gr.Button("Generate Animation")
+
+         with gr.Column():
+             output_video = gr.Video(label="Generated Animation")
+
+     generate_btn.click(
+         fn=generate_animation,
+         inputs=[audio_input, image_input, gr.State(config), gr.State(device)],
+         outputs=output_video
+     )
+
+ # Launch the app.
+ demo.launch()
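For a quick smoke test outside Gradio, the same pipeline that `generate_animation` wraps can be called directly with the imports used in app.py; the sample audio and image paths below are placeholders, not files shipped with this commit:

```python
# Hypothetical offline test of the inference path used by app.py;
# the example audio/image paths are assumptions.
import torch
from src.inference import inference_pipeline
from src.utils import load_config

config = load_config('./configs/prompts/infer.yaml')
device = "cuda" if torch.cuda.is_available() else "cpu"

video_path = inference_pipeline(
    audio_path="examples/speech.wav",         # placeholder sample audio
    reference_image_path="examples/ref.png",  # placeholder reference image
    config=config,
    device=device,
)
print("Generated video at:", video_path)
```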
requirements.txt ADDED
@@ -0,0 +1,35 @@
+ transformers>=4.46.3
+ diffusers==0.31.0
+ torchmetrics
+ torchtyping
+ tqdm
+ einops==0.8.0
+ omegaconf==2.3.0
+ opencv-python
+ av==13.1.0
+ gradio
+ accelerate==1.1.1
+ clip @ https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip#sha256=b5842c25da441d6c581b53a5c60e0c2127ebafe0f746f8e15561a006c6c3be6a
+ decord==0.6.0
+ gradio_client==1.4.3
+ imageio==2.36.0
+ imageio-ffmpeg==0.5.1
+ numpy==1.26.4
+ onnxruntime-gpu==1.20.1
+ open-clip-torch==2.29.0
+ opencv-contrib-python==4.10.0.84
+ Pillow<10.3.0,>=10.2.0
+ scikit-image==0.24.0
+ scikit-learn==1.5.2
+ scipy==1.14.1
+ torchdiffeq==0.2.5
+ torchsde==0.2.6
+ mlflow==2.18.0
+ controlnet-aux==0.0.9
+ ffmpeg-python
+ soundfile
+ mediapipe
+ IPython
+ scenedetect
+ moviepy==1.0.3
+ huggingface_hub==0.26.2