arsh121 committed
Commit f5034fe · verified · 1 Parent(s): 254a76a

Upload 4 files

Files changed (4)
  1. .gitignore +40 -0
  2. README.md +58 -13
  3. app.py +96 -0
  4. requirements.txt +35 -0
.gitignore ADDED
@@ -0,0 +1,40 @@
+ # Dataset and assets
+ EMTD_dataset/
+ assets/
+ outputs/
+
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Model files (these will be downloaded by the Space)
+ pretrained_weights/
+
+ # IDE
+ .idea/
+ .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db
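The ignored `pretrained_weights/` directory is expected to be filled at runtime ("Model files (these will be downloaded by the Space)"). A minimal sketch of how that download could be done with `huggingface_hub` (pinned in requirements.txt below); the repo id comes from the README, while the target directory is an assumption matching the .gitignore entry:

```python
# Hypothetical startup snippet, not part of this commit: fetch the
# EchoMimicV2 weights into the ignored pretrained_weights/ directory.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="BadToBest/EchoMimicV2",    # model repo referenced in README.md
    local_dir="./pretrained_weights",   # matches the .gitignore entry above
)
```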
README.md CHANGED
@@ -1,13 +1,58 @@
- ---
- title: Echomimic V2
- emoji: 🦀
- colorFrom: yellow
- colorTo: yellow
- sdk: gradio
- sdk_version: 5.29.1
- app_file: app.py
- pinned: false
- short_description: chomimic-v2
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: "EchoMimicV2: Audio-Driven Human Animation"
+ emoji: "🎬"
+ colorFrom: "blue"
+ colorTo: "purple"
+ sdk: "gradio"
+ sdk_version: "4.19.2"
+ app_file: "app.py"
+ pinned: false
+ ---
+
+ # EchoMimicV2: Audio-Driven Human Animation
+
+ This Space provides a web interface for EchoMimicV2, an AI model that generates human animations from audio input and a reference image.
+
+ ## How to Use
+
+ 1. Upload an audio file (WAV format recommended)
+ 2. Upload a reference image of a person
+ 3. Click "Generate Animation" to create the video
+ 4. Wait for processing to complete
+ 5. Download the generated video
+
+ ## Features
+
+ - Audio-driven human animation
+ - Support for both English and Chinese audio
+ - High-quality video generation
+ - Realistic facial expressions and body movements
+
+ ## Model Information
+
+ This Space uses the EchoMimicV2 model from [BadToBest/EchoMimicV2](https://huggingface.co/BadToBest/EchoMimicV2).
+
+ ## Requirements
+
+ - Audio file (WAV format recommended)
+ - Reference image of a person (clear face visible)
+ - Processing time varies based on input length
+
+ ## Limitations
+
+ - Best results with clear audio and front-facing reference images
+ - Processing time depends on video length
+ - Requires substantial GPU memory for optimal performance
+
+ ## Citation
+
+ If you use this model, please cite:
+ ```
+ @misc{meng2024echomimicv2,
+   title={EchoMimicV2: Towards Striking, Simplified, and Semi-Body Human Animation},
+   author={Rang Meng and Xingyu Zhang and Yuming Li and Chenguang Ma},
+   year={2024},
+   eprint={2411.10061},
+   archivePrefix={arXiv}
+ }
+ ```
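Besides the UI workflow in "How to Use", the Space can also be called programmatically with `gradio_client` (pinned in requirements.txt). A rough sketch, assuming a Space id of `arsh121/echomimic-v2` and the default endpoint exposed by the single button click; the id, endpoint, and argument order are all assumptions:

```python
# Hypothetical client-side call; Space id and argument order are assumptions
# and may need adjusting to the deployed app.
from gradio_client import Client, handle_file

client = Client("arsh121/echomimic-v2")   # assumed Space id
result = client.predict(
    handle_file("speech.wav"),            # audio input (filepath)
    handle_file("reference.png"),         # reference image (filepath)
)
print(result)                             # path to the generated video
```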
app.py ADDED
@@ -0,0 +1,96 @@
+ import os
+ import random
+ from pathlib import Path
+ import numpy as np
+ import torch
+ from diffusers import AutoencoderKL, DDIMScheduler
+ from PIL import Image
+ from src.models.unet_2d_condition import UNet2DConditionModel
+ from src.models.unet_3d_emo import EMOUNet3DConditionModel
+ from src.models.whisper.audio2feature import load_audio_model
+ from src.pipelines.pipeline_echomimicv2 import EchoMimicV2Pipeline
+ from src.utils.util import save_videos_grid
+ from src.models.pose_encoder import PoseEncoder
+ from src.utils.dwpose_util import draw_pose_select_v2
+ from moviepy.editor import VideoFileClip, AudioFileClip
+
+ import gradio as gr
+ from datetime import datetime
+ from torchao.quantization import quantize_, int8_weight_only
+ import gc
+ from src.inference import inference_pipeline
+ from src.utils import load_config
+
+ # Report the runtime environment (GPU details only when CUDA is available).
+ dtype = torch.float16
+ if torch.cuda.is_available():
+     device = "cuda"
+     total_vram_in_gb = torch.cuda.get_device_properties(0).total_memory / 1073741824
+     print(f'\033[32mCUDA version: {torch.version.cuda}\033[0m')
+     print(f'\033[32mPyTorch version: {torch.__version__}\033[0m')
+     print(f'\033[32mGPU model: {torch.cuda.get_device_name()}\033[0m')
+     print(f'\033[32mVRAM size: {total_vram_in_gb:.2f}GB\033[0m')
+     print(f'\033[32mPrecision: float16\033[0m')
+ else:
+     print("CUDA not available, using CPU")
+     device = "cpu"
+
+ ffmpeg_path = os.getenv('FFMPEG_PATH')
+ if ffmpeg_path is None:
+     print("Please download ffmpeg-static and export FFMPEG_PATH.\nFor example: export FFMPEG_PATH=./ffmpeg-4.4-amd64-static")
+ elif ffmpeg_path not in os.getenv('PATH', ''):
+     print("Adding ffmpeg to PATH")
+     os.environ["PATH"] = f"{ffmpeg_path}:{os.environ['PATH']}"
+
+ # Load the inference configuration and pick the device.
+ def initialize_model():
+     config = load_config('./configs/prompts/infer.yaml')
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     return config, device
+
+ # Run the EchoMimicV2 pipeline; both inputs arrive as file paths (type="filepath").
+ def generate_animation(audio_file, reference_image, config, device):
+     try:
+         output_video = inference_pipeline(
+             audio_path=audio_file,
+             reference_image_path=reference_image,
+             config=config,
+             device=device
+         )
+         return output_video
+     except Exception as e:
+         raise gr.Error(str(e))
+
+ # Initialize the model once at startup.
+ config, device = initialize_model()
+
+ # Create the Gradio interface.
+ with gr.Blocks(title="EchoMimicV2: Audio-Driven Human Animation") as demo:
+     gr.Markdown("""
+     # EchoMimicV2: Audio-Driven Human Animation
+     Upload a reference image and audio file to generate an animated video.
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             audio_input = gr.Audio(
+                 label="Upload Audio",
+                 type="filepath"
+             )
+             image_input = gr.Image(
+                 label="Upload Reference Image",
+                 type="filepath"
+             )
+             generate_btn = gr.Button("Generate Animation")
+
+         with gr.Column():
+             output_video = gr.Video(label="Generated Animation")
+
+     generate_btn.click(
+         fn=generate_animation,
+         inputs=[audio_input, image_input, gr.State(config), gr.State(device)],
+         outputs=output_video
+     )
+
+ # Launch the app.
+ demo.launch()
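For a quick smoke test outside Gradio, the same pipeline that `generate_animation` wraps can be called directly with the imports used in app.py; the sample audio and image paths below are placeholders, not files shipped with this commit:

```python
# Hypothetical offline test of the inference path used by app.py;
# the example audio/image paths are assumptions.
import torch
from src.inference import inference_pipeline
from src.utils import load_config

config = load_config('./configs/prompts/infer.yaml')
device = "cuda" if torch.cuda.is_available() else "cpu"

video_path = inference_pipeline(
    audio_path="examples/speech.wav",         # placeholder sample audio
    reference_image_path="examples/ref.png",  # placeholder reference image
    config=config,
    device=device,
)
print("Generated video at:", video_path)
```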
requirements.txt ADDED
@@ -0,0 +1,35 @@
+ transformers>=4.46.3
+ diffusers==0.31.0
+ torchmetrics
+ torchtyping
+ tqdm
+ einops==0.8.0
+ omegaconf==2.3.0
+ opencv-python
+ av==13.1.0
+ gradio
+ accelerate==1.1.1
+ clip @ https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip#sha256=b5842c25da441d6c581b53a5c60e0c2127ebafe0f746f8e15561a006c6c3be6a
+ decord==0.6.0
+ gradio_client==1.4.3
+ imageio==2.36.0
+ imageio-ffmpeg==0.5.1
+ numpy==1.26.4
+ onnxruntime-gpu==1.20.1
+ open-clip-torch==2.29.0
+ opencv-contrib-python==4.10.0.84
+ Pillow<10.3.0,>=10.2.0
+ scikit-image==0.24.0
+ scikit-learn==1.5.2
+ scipy==1.14.1
+ torchdiffeq==0.2.5
+ torchsde==0.2.6
+ mlflow==2.18.0
+ controlnet-aux==0.0.9
+ ffmpeg-python
+ soundfile
+ mediapipe
+ IPython
+ scenedetect
+ moviepy==1.0.3
+ huggingface_hub==0.26.2