Skip to content

Commit 28b8dfc

Browse files
author
shadowcun
committed
v0.0.2rc: see release note
2 parents 1119361 + b746106 commit 28b8dfc

File tree

6 files changed

+256
-7
lines changed

6 files changed

+256
-7
lines changed

.gitignore

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ cython_debug/
157157
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160-
#.idea/
160+
.idea/
161161

162162
examples/results/*
163163
gfpgan/*
@@ -166,4 +166,7 @@ assets/*
166166
results/*
167167
Dockerfile
168168
start_docker.sh
169-
start.sh
169+
start.sh
170+
171+
# Mac
172+
.DS_Store

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55

66
<!--<h2> 😭 SadTalker: <span style="font-size:12px">Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation </span> </h2> -->
77

8-
<a href='https://arxiv.org/abs/2211.12194'><img src='https://img.shields.io/badge/ArXiv-PDF-red'></a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<a href='https://sadtalker.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker) &nbsp;&nbsp;&nbsp;&nbsp;&nbsp; [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb)
9-
8+
<a href='https://arxiv.org/abs/2211.12194'><img src='https://img.shields.io/badge/ArXiv-PDF-red'></a> &nbsp; <a href='https://sadtalker.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> &nbsp; [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) &nbsp; [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/vinthony/SadTalker) &nbsp; [![sd webui-colab](https://img.shields.io/badge/Automatic1111-Colab-green)](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) &nbsp; [![Replicate](https://replicate.com/cjwbw/sadtalker/badge)](https://replicate.com/cjwbw/sadtalker)
109

1110
<div>
1211
<a target='_blank'>Wenxuan Zhang <sup>*,1,2</sup> </a>&emsp;
@@ -121,9 +120,10 @@ Tutorials from communities: [中文windows教程](https://www.bilibili.com/video
121120
### Windows ([中文windows教程](https://www.bilibili.com/video/BV1Dc411W7V6/)):
122121

123122
1. Install [Python 3.10.6](https://www.python.org/downloads/windows/), checking "Add Python to PATH".
124-
2. Install [git](https://git-scm.com/download/win).
125-
3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows).
123+
2. Install [git](https://git-scm.com/download/win) manually (OR `scoop install git` via [scoop](https://scoop.sh/)).
124+
3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows) (OR using `scoop install ffmpeg` via [scoop](https://scoop.sh/)).
126125
4. Download our SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`.
126+
5. Download the `checkpoint` and `gfpgan` [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
127127
6. Run `start.bat` from Windows Explorer as a normal, non-administrator user; a gradio WebUI demo will be started.
128128

129129
### Macbook:

cog.yaml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
build:
2+
gpu: true
3+
cuda: "11.3"
4+
python_version: "3.8"
5+
system_packages:
6+
- "ffmpeg"
7+
- "libgl1-mesa-glx"
8+
- "libglib2.0-0"
9+
python_packages:
10+
- "torch==1.12.1"
11+
- "torchvision==0.13.1"
12+
- "torchaudio==0.12.1"
13+
- "joblib==1.1.0"
14+
- "scikit-image==0.19.3"
15+
- "basicsr==1.4.2"
16+
- "facexlib==0.3.0"
17+
- "resampy==0.3.1"
18+
- "pydub==0.25.1"
19+
- "scipy==1.10.1"
20+
- "kornia==0.6.8"
21+
- "face_alignment==1.3.5"
22+
- "imageio==2.19.3"
23+
- "imageio-ffmpeg==0.4.7"
24+
- "librosa==0.9.2" #
25+
- "tqdm==4.65.0"
26+
- "yacs==0.1.8"
27+
- "gfpgan==1.3.8"
28+
- "dlib-bin==19.24.1"
29+
- "av==10.0.0"
30+
- "trimesh==3.9.20"
31+
run:
32+
- mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth" "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
33+
- mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip" "https://www.adrianbulat.com/downloads/python-fan/2DFAN4-cd938726ad.zip"
34+
35+
predict: "predict.py:Predictor"

docs/FAQ.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,21 @@ Make sure you have downloaded the checkpoints and gfpgan as [here](https://githu
2626
**Q: RuntimeError: unexpected EOF, expected 237192 more bytes. The file might be corrupted.**
2727

2828
The files are not automatically downloaded. Please update the code and download the gfpgan folders as [here](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
29+
30+
**Q: CUDA out of memory error**
31+
32+
please refer to https://stackoverflow.com/questions/73747731/runtimeerror-cuda-out-of-memory-how-setting-max-split-size-mb
33+
34+
```
35+
# windows
36+
set PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
37+
python inference.py ...
38+
39+
# linux
40+
export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128
41+
python inference.py ...
42+
```
43+
44+
**Q: Error while decoding stream #0:0: Invalid data found when processing input [mp3float @ 0000015037628c00] Header missing**
45+
46+
Our method only supports .wav or .mp3 files as input; please make sure the supplied audio files are in one of these formats.

predict.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""run bash scripts/download_models.sh first to prepare the weights file"""
2+
import os
3+
import shutil
4+
from argparse import Namespace
5+
from src.utils.preprocess import CropAndExtract
6+
from src.test_audio2coeff import Audio2Coeff
7+
from src.facerender.animate import AnimateFromCoeff
8+
from src.generate_batch import get_data
9+
from src.generate_facerender_batch import get_facerender_data
10+
from src.utils.init_path import init_path
11+
from cog import BasePredictor, Input, Path
12+
13+
# Directory holding the pre-downloaded SadTalker weights
# (run `bash scripts/download_models.sh` first — see module docstring).
checkpoints = "checkpoints"


class Predictor(BasePredictor):
    """Cog predictor wrapping the SadTalker audio-driven talking-face pipeline."""

    def setup(self):
        """Load the models into memory to make running multiple predictions efficient."""
        device = "cuda"

        sadtalker_paths = init_path(checkpoints, os.path.join("src", "config"))

        # Face cropper / 3DMM coefficient extractor for the source image.
        self.preprocess_model = CropAndExtract(sadtalker_paths, device)

        # Maps the driving audio to expression/pose coefficients.
        self.audio_to_coeff = Audio2Coeff(sadtalker_paths, device)

        # Two renderer instances: "full" serves the full-frame preprocess
        # mode, "others" serves the crop/resize modes.
        self.animate_from_coeff = {
            "full": AnimateFromCoeff(sadtalker_paths, device),
            "others": AnimateFromCoeff(sadtalker_paths, device),
        }

    def predict(
        self,
        source_image: Path = Input(
            description="Upload the source image, it can be video.mp4 or picture.png",
        ),
        driven_audio: Path = Input(
            # NOTE: was ".wav and .mp4"; the project FAQ states only wav/mp3
            # audio is supported, so ".mp4" here was a typo.
            description="Upload the driven audio, accepts .wav and .mp3 file",
        ),
        enhancer: str = Input(
            description="Choose a face enhancer",
            choices=["gfpgan", "RestoreFormer"],
            default="gfpgan",
        ),
        preprocess: str = Input(
            description="how to preprocess the images",
            choices=["crop", "resize", "full"],
            default="full",
        ),
        ref_eyeblink: Path = Input(
            description="path to reference video providing eye blinking",
            default=None,
        ),
        ref_pose: Path = Input(
            description="path to reference video providing pose",
            default=None,
        ),
        still: bool = Input(
            description="can crop back to the original videos for the full body animation when preprocess is full",
            default=True,
        ),
    ) -> Path:
        """Run a single prediction on the model.

        Returns:
            Path to the rendered (enhanced) talking-head video.

        Raises:
            RuntimeError: if 3DMM coefficients cannot be extracted from the
                source image, or if no enhanced output video is produced.
        """
        animate_from_coeff = (
            self.animate_from_coeff["full"]
            if preprocess == "full"
            else self.animate_from_coeff["others"]
        )

        args = load_default()
        args.pic_path = str(source_image)
        args.audio_path = str(driven_audio)
        device = "cuda"
        args.still = still
        args.ref_eyeblink = None if ref_eyeblink is None else str(ref_eyeblink)
        args.ref_pose = None if ref_pose is None else str(ref_pose)

        # Start from a clean results directory so stale outputs from a
        # previous run cannot be picked up by the listdir scan below.
        results_dir = "results"
        if os.path.exists(results_dir):
            shutil.rmtree(results_dir)
        os.makedirs(results_dir)
        first_frame_dir = os.path.join(results_dir, "first_frame_dir")
        os.makedirs(first_frame_dir)

        # Crop the source image and extract its 3DMM coefficients.
        print("3DMM Extraction for source image")
        first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
            args.pic_path, first_frame_dir, preprocess, source_image_flag=True
        )
        if first_coeff_path is None:
            # Fail loudly: returning None would violate the declared `-> Path`
            # contract and surface as an opaque error downstream.
            raise RuntimeError("Can't get the coeffs of the input")

        if ref_eyeblink is not None:
            ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[
                0
            ]
            ref_eyeblink_frame_dir = os.path.join(results_dir, ref_eyeblink_videoname)
            os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
            print("3DMM Extraction for the reference video providing eye blinking")
            ref_eyeblink_coeff_path, _, _ = self.preprocess_model.generate(
                ref_eyeblink, ref_eyeblink_frame_dir
            )
        else:
            ref_eyeblink_coeff_path = None

        if ref_pose is not None:
            if ref_pose == ref_eyeblink:
                # Same clip drives both; reuse the coefficients already extracted.
                ref_pose_coeff_path = ref_eyeblink_coeff_path
            else:
                ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
                ref_pose_frame_dir = os.path.join(results_dir, ref_pose_videoname)
                os.makedirs(ref_pose_frame_dir, exist_ok=True)
                print("3DMM Extraction for the reference video providing pose")
                ref_pose_coeff_path, _, _ = self.preprocess_model.generate(
                    ref_pose, ref_pose_frame_dir
                )
        else:
            ref_pose_coeff_path = None

        # audio2coeff
        batch = get_data(
            first_coeff_path,
            args.audio_path,
            device,
            ref_eyeblink_coeff_path,
            still=still,
        )
        coeff_path = self.audio_to_coeff.generate(
            batch, results_dir, args.pose_style, ref_pose_coeff_path
        )
        # coeff2video
        print("coeff2video")
        data = get_facerender_data(
            coeff_path,
            crop_pic_path,
            first_coeff_path,
            args.audio_path,
            args.batch_size,
            args.input_yaw,
            args.input_pitch,
            args.input_roll,
            expression_scale=args.expression_scale,
            still_mode=still,
            preprocess=preprocess,
        )
        animate_from_coeff.generate(
            data, results_dir, args.pic_path, crop_info,
            enhancer=enhancer, background_enhancer=args.background_enhancer,
            preprocess=preprocess)

        output = "/tmp/out.mp4"
        enhanced = [f for f in os.listdir(results_dir) if "enhanced.mp4" in f]
        if not enhanced:
            # Explicit error instead of a bare IndexError on `[0]`.
            raise RuntimeError(f"no enhanced.mp4 produced in {results_dir!r}")
        shutil.copy(os.path.join(results_dir, enhanced[0]), output)

        return Path(output)
170+
171+
172+
def load_default():
    """Return the default inference arguments as an argparse Namespace.

    Mirrors the CLI defaults of ``inference.py`` so the Cog predictor can
    reuse the same pipeline code without parsing command-line flags.
    """
    defaults = {
        # sampling / rendering controls
        "pose_style": 0,
        "batch_size": 2,
        "expression_scale": 1.0,
        "input_yaw": None,
        "input_pitch": None,
        "input_roll": None,
        "background_enhancer": None,
        "face3dvis": False,
        # 3D face reconstruction backbone settings
        "net_recon": "resnet50",
        "init_path": None,
        "use_last_fc": False,
        "bfm_folder": "./src/config/",
        "bfm_model": "BFM_model_front.mat",
        # camera parameters for the 3DMM renderer
        "focal": 1015.0,
        "center": 112.0,
        "camera_d": 10.0,
        "z_near": 5.0,
        "z_far": 15.0,
    }
    return Namespace(**defaults)

src/facerender/animate.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,8 @@ def generate(self, x, video_save_dir, pic_path, crop_info, enhancer=None, backgr
206206
audio_name = os.path.splitext(os.path.split(audio_path)[-1])[0]
207207
new_audio_path = os.path.join(video_save_dir, audio_name+'.wav')
208208
start_time = 0
209-
sound = AudioSegment.from_mp3(audio_path)
209+
# cog will not keep the .mp3 filename
210+
sound = AudioSegment.from_file(audio_path)
210211
frames = frame_num
211212
end_time = start_time + frames*1/25*1000
212213
word1=sound.set_frame_rate(16000)

0 commit comments

Comments
 (0)