joonson · rb-synth · Feb 17, 2025 · Feb 17, 2025
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,6 @@ data/
 protos/
 utils/
 *.pth
+
+.vscode/
+*.egg-info/
diff --git a/download_model.sh b/download_model.sh
@@ -1,9 +1,26 @@
 # SyncNet model
 
-mkdir data
-wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model
-wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi
+# Require SYNCNET_MODEL_DIR to be set and non-empty: an empty value would make
+# the download targets resolve to the filesystem root (e.g. /syncnet_v2.model).
+if [ -z "${SYNCNET_MODEL_DIR:-}" ]; then
+    echo "SYNCNET_MODEL_DIR is unset or empty" >&2
+    exit 1
+fi
 
-# For the pre-processing pipeline
-mkdir detectors/s3fd/weights
-wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth
+# Quote every expansion so a model directory containing spaces still works.
+mkdir -p "${SYNCNET_MODEL_DIR}"
+
+# SyncNet weights. wget -O creates the output file before the transfer starts,
+# so remove it on failure; otherwise the -f check would skip the retry next run.
+syncnet_path="${SYNCNET_MODEL_DIR}/syncnet_v2.model"
+if [ ! -f "${syncnet_path}" ]; then
+    wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O "${syncnet_path}" \
+        || { rm -f "${syncnet_path}"; exit 1; }
+fi
+
+# Face-detector weights loaded by detectors/s3fd (same clean-up-on-failure pattern).
+sfd_path="${SYNCNET_MODEL_DIR}/sfd_face.pth"
+if [ ! -f "${sfd_path}" ]; then
+    wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O "${sfd_path}" \
+        || { rm -f "${sfd_path}"; exit 1; }
+fi
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "syncnet"
+version = "0.1.0"
+license = { text = "MIT" }
+requires-python = ">=3.7"
+dependencies = [
+    "torch>=1.4.0",
+    "torchvision>=0.5.0",
+    "numpy>=1.18.1",
+    "scipy>=1.2.1",
+    "scenedetect>=0.6.5.2",
+    "opencv-contrib-python",
+    "python_speech_features",
+]
+
+[tool.setuptools.packages.find]
+where = ["src"]
diff --git a/requirements.txt b/requirements.txt
@@ -2,6 +2,6 @@ torch>=1.4.0
 torchvision>=0.5.0
 numpy>=1.18.1
 scipy>=1.2.1
-scenedetect==0.5.1
+scenedetect>=0.6.5.2
 opencv-contrib-python
 python_speech_features
diff --git a/run_syncnet.py b/run_syncnet.py
diff --git a/SyncNetInstance.py → src/syncnet/SyncNetInstance.py b/SyncNetInstance.py → src/syncnet/SyncNetInstance.py
@@ -10,14 +10,14 @@
 
 from scipy import signal
 from scipy.io import wavfile
-from SyncNetModel import *
+from syncnet.SyncNetModel import S
 from shutil import rmtree
 
 
 # ==================== Get OFFSET ====================
 
 def calc_pdist(feat1, feat2, vshift=10):
-    
+
     win_size = vshift*2+1
 
     feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift))
@@ -52,18 +52,18 @@ def evaluate(self, opt, videofile):
 
         os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
 
-        command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) 
+        command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
         output = subprocess.call(command, shell=True, stdout=None)
 
-        command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) 
+        command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
         output = subprocess.call(command, shell=True, stdout=None)
-        
+
         # ========== ==========
-        # Load video 
+        # Load video
         # ========== ==========
 
         images = []
-        
+
         flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
         flist.sort()
 
@@ -95,7 +95,7 @@ def evaluate(self, opt, videofile):
             print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))
 
         min_length = min(len(images),math.floor(len(audio)/640))
-        
+
         # ========== ==========
         # Generate video and audio feats
         # ========== ==========
@@ -106,7 +106,7 @@ def evaluate(self, opt, videofile):
 
         tS = time.time()
         for i in range(0,lastframe,opt.batch_size):
-            
+
             im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
             im_in = torch.cat(im_batch,0)
             im_out  = self.__S__.forward_lip(im_in.cuda());
@@ -123,7 +123,7 @@ def evaluate(self, opt, videofile):
         # ========== ==========
         # Compute offset
         # ========== ==========
-            
+
         print('Compute time %.3f sec.' % (time.time()-tS))
 
         dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
@@ -138,7 +138,7 @@ def evaluate(self, opt, videofile):
         # fdist   = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
         fconf   = torch.median(mdist).numpy() - fdist
         fconfm  = signal.medfilt(fconf,kernel_size=9)
-        
+
         numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
         print('Framewise conf: ')
         print(fconfm)
@@ -150,9 +150,9 @@ def evaluate(self, opt, videofile):
     def extract_feature(self, opt, videofile):
 
         self.__S__.eval();
-        
+
         # ========== ==========
-        # Load video 
+        # Load video
         # ========== ==========
         cap = cv2.VideoCapture(videofile)
 
@@ -171,7 +171,7 @@ def extract_feature(self, opt, videofile):
         im = numpy.transpose(im,(0,3,4,1,2))
 
         imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
-        
+
         # ========== ==========
         # Generate video feats
         # ========== ==========
@@ -181,7 +181,7 @@ def extract_feature(self, opt, videofile):
 
         tS = time.time()
         for i in range(0,lastframe,opt.batch_size):
-            
+
             im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
             im_in = torch.cat(im_batch,0)
             im_out  = self.__S__.forward_lipfeat(im_in.cuda());
@@ -192,7 +192,7 @@ def extract_feature(self, opt, videofile):
         # ========== ==========
         # Compute offset
         # ========== ==========
-            
+
         print('Compute time %.3f sec.' % (time.time()-tS))
 
         return im_feat

diff --git a/SyncNetModel.py → src/syncnet/SyncNetModel.py b/SyncNetModel.py → src/syncnet/SyncNetModel.py
diff --git a/detectors/README.md → src/syncnet/detectors/README.md b/detectors/README.md → src/syncnet/detectors/README.md
diff --git a/detectors/__init__.py → src/syncnet/detectors/__init__.py b/detectors/__init__.py → src/syncnet/detectors/__init__.py
diff --git a/detectors/s3fd/__init__.py → src/syncnet/detectors/s3fd/__init__.py b/detectors/s3fd/__init__.py → src/syncnet/detectors/s3fd/__init__.py
@@ -2,11 +2,11 @@
 import numpy as np
 import cv2
 import torch
-from torchvision import transforms
 from .nets import S3FDNet
 from .box_utils import nms_
+import os
 
-PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth'
+PATH_WEIGHT = os.path.join(os.environ["SYNCNET_MODEL_DIR"], "sfd_face.pth")
 img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
 
 
@@ -23,7 +23,7 @@ def __init__(self, device='cuda'):
         self.net.load_state_dict(state_dict)
         self.net.eval()
         print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
-    
+
     def detect_faces(self, image, conf_th=0.8, scales=[1]):
 
         w, h = image.shape[1], image.shape[0]

diff --git a/detectors/s3fd/box_utils.py → src/syncnet/detectors/s3fd/box_utils.py b/detectors/s3fd/box_utils.py → src/syncnet/detectors/s3fd/box_utils.py
@@ -35,7 +35,7 @@ def nms_(dets, thresh):
         inds = np.where(ovr <= thresh)[0]
         order = order[inds + 1]
 
-    return np.array(keep).astype(np.int)
+    return np.array(keep).astype(np.int32)
 
 
 def decode(loc, priors, variances):

diff --git a/detectors/s3fd/nets.py → src/syncnet/detectors/s3fd/nets.py b/detectors/s3fd/nets.py → src/syncnet/detectors/s3fd/nets.py
diff --git a/src/syncnet/run_all.py b/src/syncnet/run_all.py
@@ -0,0 +1,72 @@
+import os
+import argparse
+from syncnet.run_pipeline import run_pipeline
+from syncnet.run_syncnet import run_syncnet
+from tempfile import TemporaryDirectory
+from pathlib import Path
+
+
+def run_all(video_path: str) -> float:
+    """Run the pre-processing pipeline, then SyncNet, on one video.
+
+    ``run_pipeline`` is run on ``video_path`` inside a temporary directory;
+    the file it is expected to leave at ``pycrop/00000.avi`` is then passed
+    to ``run_syncnet``. The temporary directory is removed before returning.
+
+    Args:
+        video_path: Path to the input video file.
+
+    Returns:
+        The value returned by ``run_syncnet`` for the cropped video.
+
+    Raises:
+        KeyError: If the ``SYNCNET_MODEL_DIR`` environment variable is unset.
+        FileNotFoundError: If ``syncnet_v2.model`` is missing from that
+            directory, or if the pipeline did not produce the cropped video.
+    """
+    syncnet_model_path = Path(os.environ["SYNCNET_MODEL_DIR"]) / "syncnet_v2.model"
+    # Raise instead of assert: assertions are stripped under ``python -O`` and
+    # a bare AssertionError gives no hint about which file is missing.
+    if not syncnet_model_path.exists():
+        raise FileNotFoundError(f"SyncNet model not found: {syncnet_model_path}")
+
+    with TemporaryDirectory() as tmp_dir:
+        pipeline_opts = argparse.Namespace(
+            data_dir=tmp_dir,
+            videofile=video_path,
+            reference="",
+            facedet_scale=0.25,
+            crop_scale=0.4,
+            min_track=100,
+            frame_rate=25,
+            num_failed_det=25,
+            min_face_size=100,
+        )
+        run_pipeline(pipeline_opts)
+
+        processed_video_path = Path(tmp_dir) / "pycrop" / "00000.avi"
+        if not processed_video_path.exists():
+            raise FileNotFoundError(
+                f"run_pipeline produced no output at {processed_video_path} "
+                f"for input {video_path!r}"
+            )
+
+        syncnet_opts = argparse.Namespace(
+            # Reuse the path validated above instead of rebuilding it.
+            initial_model=str(syncnet_model_path),
+            batch_size=20,
+            vshift=1,
+            data_dir=tmp_dir,
+            videofile=str(processed_video_path),
+            reference="",
+        )
+        return run_syncnet(syncnet_opts)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SyncNet")
+    parser.add_argument("--video_path", type=str, required=True, help="Path to the video file")
+    args = parser.parse_args()
+    # Surface run_all's return value; without this the command line would
+    # compute it and silently drop it.
+    print(run_all(**vars(args)))
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,3 +43,6 @@ data/ @@
     protos/
     utils/
     *.pth
+    .vscode/
+    *.egg-info/