diff --git a/.gitignore b/.gitignore
index 350ada0..37d2e32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,6 @@ data/
 protos/
 utils/
 *.pth
+
+.vscode/
+*.egg-info/
diff --git a/download_model.sh b/download_model.sh
index 3e3a9dc..9c534d9 100755
--- a/download_model.sh
+++ b/download_model.sh
@@ -1,9 +1,19 @@
 # SyncNet model
 
-mkdir data
-wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O data/syncnet_v2.model
-wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/example.avi -O data/example.avi
+# check SYNCNET_MODEL_DIR is set
+if [ -z "${SYNCNET_MODEL_DIR+x}" ]; then
+    echo "SYNCNET_MODEL_DIR is unset"
+    exit 1
+fi
 
-# For the pre-processing pipeline
-mkdir detectors/s3fd/weights
-wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O detectors/s3fd/weights/sfd_face.pth
\ No newline at end of file
+mkdir -p "${SYNCNET_MODEL_DIR}"
+
+syncnet_path=${SYNCNET_MODEL_DIR}/syncnet_v2.model
+if [ ! -f "${syncnet_path}" ]; then
+    wget http://www.robots.ox.ac.uk/~vgg/software/lipsync/data/syncnet_v2.model -O "${syncnet_path}"
+fi
+
+sfd_path=${SYNCNET_MODEL_DIR}/sfd_face.pth
+if [ ! -f "${sfd_path}" ]; then
+    wget https://www.robots.ox.ac.uk/~vgg/software/lipsync/data/sfd_face.pth -O "${sfd_path}"
+fi
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8b9bc85
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,21 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "syncnet"
+version = "0.1.0"
+license = { text = "MIT" }
+requires-python = ">=3.7"
+dependencies = [
+    "torch>=1.4.0",
+    "torchvision>=0.5.0",
+    "numpy>=1.18.1",
+    "scipy>=1.2.1",
+    "scenedetect>=0.6.5.2",
+    "opencv-contrib-python",
+    "python_speech_features",
+]
+
+[tool.setuptools.packages.find]
+where = ["src"]
diff --git a/requirements.txt b/requirements.txt
index 8919740..b215e78 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,6 @@ torch>=1.4.0
 torchvision>=0.5.0
 numpy>=1.18.1
 scipy>=1.2.1
-scenedetect==0.5.1
+scenedetect>=0.6.5.2
 opencv-contrib-python
 python_speech_features
diff --git a/run_syncnet.py b/run_syncnet.py
deleted file mode 100755
index 45099fd..0000000
--- a/run_syncnet.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/python
-#-*- coding: utf-8 -*-
-
-import time, pdb, argparse, subprocess, pickle, os, gzip, glob
-
-from SyncNetInstance import *
-
-# ==================== PARSE ARGUMENT ====================
-
-parser = argparse.ArgumentParser(description = "SyncNet");
-parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='');
-parser.add_argument('--batch_size', type=int, default='20', help='');
-parser.add_argument('--vshift', type=int, default='15', help='');
-parser.add_argument('--data_dir', type=str, default='data/work', help='');
-parser.add_argument('--videofile', type=str, default='', help='');
-parser.add_argument('--reference', type=str, default='', help='');
-opt = parser.parse_args();
-
-setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
-setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
-setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
-setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
-
-
-# ==================== LOAD MODEL AND FILE LIST ====================
-
-s = SyncNetInstance();
-
-s.loadParameters(opt.initial_model);
-print("Model %s loaded."%opt.initial_model);
-
-flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi'))
-flist.sort()
-
-# ==================== GET OFFSETS ====================
-
-dists = []
-for idx, fname in enumerate(flist):
-    offset, conf, dist = s.evaluate(opt,videofile=fname)
-    dists.append(dist)
-
-# ==================== PRINT RESULTS TO FILE ====================
-
-with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil:
-    pickle.dump(dists, fil)
diff --git a/SyncNetInstance.py b/src/syncnet/SyncNetInstance.py
similarity index 95%
rename from SyncNetInstance.py
rename to src/syncnet/SyncNetInstance.py
index 497d44f..40edea7 100644
--- a/SyncNetInstance.py
+++ b/src/syncnet/SyncNetInstance.py
@@ -10,14 +10,14 @@
 from scipy import signal
 from scipy.io import wavfile
 
-from SyncNetModel import *
+from syncnet.SyncNetModel import S
 from shutil import rmtree
 
 
 # ==================== Get OFFSET ====================
 
 def calc_pdist(feat1, feat2, vshift=10):
-    
+
     win_size = vshift*2+1
 
     feat2p = torch.nn.functional.pad(feat2,(0,0,vshift,vshift))
@@ -52,18 +52,18 @@ def evaluate(self, opt, videofile):
 
         os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
 
-        command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg'))) 
+        command = ("ffmpeg -y -i %s -threads 1 -f image2 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'%06d.jpg')))
         output = subprocess.call(command, shell=True, stdout=None)
 
-        command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav'))) 
+        command = ("ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (videofile,os.path.join(opt.tmp_dir,opt.reference,'audio.wav')))
         output = subprocess.call(command, shell=True, stdout=None)
-        
+
         # ========== ==========
-        # Load video 
+        # Load video
         # ========== ==========
 
         images = []
-        
+
         flist = glob.glob(os.path.join(opt.tmp_dir,opt.reference,'*.jpg'))
         flist.sort()
 
@@ -95,7 +95,7 @@ def evaluate(self, opt, videofile):
             print("WARNING: Audio (%.4fs) and video (%.4fs) lengths are different."%(float(len(audio))/16000,float(len(images))/25))
 
         min_length = min(len(images),math.floor(len(audio)/640))
-        
+
         # ========== ==========
         # Generate video and audio feats
         # ========== ==========
@@ -106,7 +106,7 @@ def evaluate(self, opt, videofile):
         tS = time.time()
 
         for i in range(0,lastframe,opt.batch_size):
-            
+
             im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
             im_in = torch.cat(im_batch,0)
             im_out  = self.__S__.forward_lip(im_in.cuda());
@@ -123,7 +123,7 @@ def evaluate(self, opt, videofile):
         # ========== ==========
         # Compute offset
         # ========== ==========
-            
+
         print('Compute time %.3f sec.' % (time.time()-tS))
 
         dists = calc_pdist(im_feat,cc_feat,vshift=opt.vshift)
@@ -138,7 +138,7 @@ def evaluate(self, opt, videofile):
         # fdist   = numpy.pad(fdist, (3,3), 'constant', constant_values=15)
         fconf   = torch.median(mdist).numpy() - fdist
         fconfm  = signal.medfilt(fconf,kernel_size=9)
-        
+
         numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})
         print('Framewise conf: ')
         print(fconfm)
@@ -150,9 +150,9 @@ def evaluate(self, opt, videofile):
     def extract_feature(self, opt, videofile):
 
         self.__S__.eval();
-        
+
         # ========== ==========
-        # Load video 
+        # Load video
         # ========== ==========
 
         cap = cv2.VideoCapture(videofile)
@@ -171,7 +171,7 @@ def extract_feature(self, opt, videofile):
 
         im = numpy.transpose(im,(0,3,4,1,2))
         imtv = torch.autograd.Variable(torch.from_numpy(im.astype(float)).float())
-        
+
         # ========== ==========
         # Generate video feats
         # ========== ==========
@@ -181,7 +181,7 @@ def extract_feature(self, opt, videofile):
         tS = time.time()
 
         for i in range(0,lastframe,opt.batch_size):
-            
+
             im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i,min(lastframe,i+opt.batch_size)) ]
             im_in = torch.cat(im_batch,0)
             im_out  = self.__S__.forward_lipfeat(im_in.cuda());
@@ -192,7 +192,7 @@ def extract_feature(self, opt, videofile):
         # ========== ==========
         # Compute offset
         # ========== ==========
-            
+
         print('Compute time %.3f sec.' % (time.time()-tS))
 
         return im_feat
diff --git a/SyncNetModel.py b/src/syncnet/SyncNetModel.py
similarity index 100%
rename from SyncNetModel.py
rename to src/syncnet/SyncNetModel.py
diff --git a/detectors/README.md b/src/syncnet/detectors/README.md
similarity index 100%
rename from detectors/README.md
rename to src/syncnet/detectors/README.md
diff --git a/detectors/__init__.py b/src/syncnet/detectors/__init__.py
similarity index 100%
rename from detectors/__init__.py
rename to src/syncnet/detectors/__init__.py
diff --git a/detectors/s3fd/__init__.py b/src/syncnet/detectors/s3fd/__init__.py
similarity index 95%
rename from detectors/s3fd/__init__.py
rename to src/syncnet/detectors/s3fd/__init__.py
index d7f35e0..3abb288 100644
--- a/detectors/s3fd/__init__.py
+++ b/src/syncnet/detectors/s3fd/__init__.py
@@ -2,11 +2,11 @@ import numpy as np
 import cv2
 import torch
-from torchvision import transforms
 
 from .nets import S3FDNet
 from .box_utils import nms_
+import os
 
-PATH_WEIGHT = './detectors/s3fd/weights/sfd_face.pth'
+PATH_WEIGHT = os.path.join(os.environ["SYNCNET_MODEL_DIR"], "sfd_face.pth")
 
 img_mean = np.array([104., 117., 123.])[:, np.newaxis, np.newaxis].astype('float32')
@@ -23,7 +23,7 @@ def __init__(self, device='cuda'):
         self.net.load_state_dict(state_dict)
         self.net.eval()
         print('[S3FD] finished loading (%.4f sec)' % (time.time() - tstamp))
-    
+
     def detect_faces(self, image, conf_th=0.8, scales=[1]):
 
         w, h = image.shape[1], image.shape[0]
diff --git a/detectors/s3fd/box_utils.py b/src/syncnet/detectors/s3fd/box_utils.py
similarity index 99%
rename from detectors/s3fd/box_utils.py
rename to src/syncnet/detectors/s3fd/box_utils.py
index 0779bcd..701a8e5 100644
--- a/detectors/s3fd/box_utils.py
+++ b/src/syncnet/detectors/s3fd/box_utils.py
@@ -35,7 +35,7 @@ def nms_(dets, thresh):
         inds = np.where(ovr <= thresh)[0]
         order = order[inds + 1]
 
-    return np.array(keep).astype(np.int)
+    return np.array(keep).astype(np.int32)
 
 
 def decode(loc, priors, variances):
diff --git a/detectors/s3fd/nets.py b/src/syncnet/detectors/s3fd/nets.py
similarity index 100%
rename from detectors/s3fd/nets.py
rename to src/syncnet/detectors/s3fd/nets.py
diff --git a/src/syncnet/run_all.py b/src/syncnet/run_all.py
new file mode 100644
index 0000000..b3d8f35
--- /dev/null
+++ b/src/syncnet/run_all.py
@@ -0,0 +1,45 @@
+import os
+import argparse
+from syncnet.run_pipeline import run_pipeline
+from syncnet.run_syncnet import run_syncnet
+from tempfile import TemporaryDirectory
+from pathlib import Path
+
+
+def run_all(video_path: str) -> float:
+    syncnet_model_path = Path(os.environ["SYNCNET_MODEL_DIR"]) / "syncnet_v2.model"
+    assert syncnet_model_path.exists()
+
+    with TemporaryDirectory() as tmp_dir:
+        pipeline_opts = argparse.Namespace(
+            data_dir=tmp_dir,
+            videofile=video_path,
+            reference="",
+            facedet_scale=0.25,
+            crop_scale=0.4,
+            min_track=100,
+            frame_rate=25,
+            num_failed_det=25,
+            min_face_size=100,
+        )
+        run_pipeline(pipeline_opts)
+
+        processed_video_path = Path(tmp_dir) / "pycrop" / "00000.avi"
+        assert processed_video_path.exists()
+
+        syncnet_opts = argparse.Namespace(
+            initial_model=str(syncnet_model_path),
+            batch_size=20,
+            vshift=1,  # only the zero-shift distance is used downstream
+            data_dir=tmp_dir,
+            videofile=str(processed_video_path),
+            reference="",
+        )
+        return run_syncnet(syncnet_opts)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description = "SyncNet")
+    parser.add_argument("--video_path", type=str, required=True, help="Path to the video file")
+    args = parser.parse_args()
+    run_all(**vars(args))
diff --git a/run_pipeline.py b/src/syncnet/run_pipeline.py
similarity index 57%
rename from run_pipeline.py
rename to src/syncnet/run_pipeline.py
index f5fc22e..d784bcb 100755
--- a/run_pipeline.py
+++ b/src/syncnet/run_pipeline.py
@@ -1,13 +1,11 @@
 #!/usr/bin/python
 
-import sys, time, os, pdb, argparse, pickle, subprocess, glob, cv2
+import time, os, argparse, pickle, subprocess, glob, cv2
 import numpy as np
 from shutil import rmtree
 
-import scenedetect
 from scenedetect.video_manager import VideoManager
 from scenedetect.scene_manager import SceneManager
-from scenedetect.frame_timecode import FrameTimecode
 from scenedetect.stats_manager import StatsManager
 from scenedetect.detectors import ContentDetector
 
@@ -15,48 +13,28 @@
 from scipy.io import wavfile
 from scipy import signal
 
-from detectors import S3FD
+from syncnet.detectors import S3FD
 
-# ========== ========== ========== ==========
-# # PARSE ARGS
-# ========== ========== ========== ==========
-
-parser = argparse.ArgumentParser(description = "FaceTracker");
-parser.add_argument('--data_dir', type=str, default='data/work', help='Output direcotry');
-parser.add_argument('--videofile', type=str, default='', help='Input video file');
-parser.add_argument('--reference', type=str, default='', help='Video reference');
-parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection');
-parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box');
-parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration');
-parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate');
-parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped');
-parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels');
-opt = parser.parse_args();
-
-setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
-setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
-setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
-setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
-setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes'))
 
 # ========== ========== ========== ==========
 # # IOU FUNCTION
 # ========== ========== ========== ==========
 
 def bb_intersection_over_union(boxA, boxB):
-  
+
   xA = max(boxA[0], boxB[0])
   yA = max(boxA[1], boxB[1])
   xB = min(boxA[2], boxB[2])
   yB = min(boxA[3], boxB[3])
-  
+
   interArea = max(0, xB - xA) * max(0, yB - yA)
-  
+
   boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
   boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
-  
+
   iou = interArea / float(boxAArea + boxBArea - interArea)
-  
+
   return iou
 
 # ========== ========== ========== ==========
@@ -87,7 +65,7 @@ def track_shot(opt,scenefaces):
     if track == []:
       break
     elif len(track) > opt.min_track:
-      
+
       framenum = np.array([ f['frame'] for f in track ])
       bboxes = np.array([np.array(f['bbox']) for f in track])
 
@@ -107,7 +85,7 @@ def track_shot(opt,scenefaces):
 # ========== ========== ========== ==========
 # # VIDEO CROP AND SAVE
 # ========== ========== ========== ==========
-        
+
 def crop_video(opt,track,cropfile):
 
   flist = glob.glob(os.path.join(opt.frames_dir,opt.reference,'*.jpg'))
@@ -120,12 +98,12 @@ def crop_video(opt,track,cropfile):
 
   for det in track['bbox']:
 
-    dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2) 
-    dets['y'].append((det[1]+det[3])/2) # crop center x 
+    dets['s'].append(max((det[3]-det[1]),(det[2]-det[0]))/2)
+    dets['y'].append((det[1]+det[3])/2) # crop center x
     dets['x'].append((det[0]+det[2])/2) # crop center y
 
   # Smooth detections
-  dets['s'] = signal.medfilt(dets['s'],kernel_size=13)   
+  dets['s'] = signal.medfilt(dets['s'],kernel_size=13)
   dets['x'] = signal.medfilt(dets['x'],kernel_size=13)
   dets['y'] = signal.medfilt(dets['y'],kernel_size=13)
 
@@ -134,16 +112,16 @@ def crop_video(opt,track,cropfile):
 
     cs = opt.crop_scale
     bs = dets['s'][fidx]   # Detection box size
-    bsi = int(bs*(1+2*cs))  # Pad videos by this amount 
+    bsi = int(bs*(1+2*cs))  # Pad videos by this amount
 
     image = cv2.imread(flist[frame])
-    
+
    frame = np.pad(image,((bsi,bsi),(bsi,bsi),(0,0)), 'constant', constant_values=(110,110))
    my = dets['y'][fidx]+bsi  # BBox center Y
    mx = dets['x'][fidx]+bsi  # BBox center X
 
    face = frame[int(my-bs):int(my+bs*(1+2*cs)),int(mx-bs*(1+cs)):int(mx+bs*(1+cs))]
-    
+
    vOut.write(cv2.resize(face,(224,224)))
 
   audiotmp = os.path.join(opt.tmp_dir,opt.reference,'audio.wav')
@@ -154,21 +132,15 @@ def crop_video(opt,track,cropfile):
 
   # ========== CROP AUDIO FILE ==========
 
-  command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp)) 
-  output = subprocess.call(command, shell=True, stdout=None)
-
-  if output != 0:
-    pdb.set_trace()
+  command = ("ffmpeg -y -i %s -ss %.3f -to %.3f %s" % (os.path.join(opt.avi_dir,opt.reference,'audio.wav'),audiostart,audioend,audiotmp))
+  subprocess.run(command, shell=True, stdout=None, check=True)
 
   sample_rate, audio = wavfile.read(audiotmp)
 
   # ========== COMBINE AUDIO AND VIDEO FILES ==========
 
   command = ("ffmpeg -y -i %st.avi -i %s -c:v copy -c:a copy %s.avi" % (cropfile,audiotmp,cropfile))
-  output = subprocess.call(command, shell=True, stdout=None)
-
-  if output != 0:
-    pdb.set_trace()
+  subprocess.run(command, shell=True, stdout=None, check=True)
 
   print('Written %s'%cropfile)
 
@@ -190,11 +162,11 @@ def inference_video(opt):
   flist.sort()
 
   dets = []
-      
+
   for fidx, fname in enumerate(flist):
 
     start_time = time.time()
-    
+
     image = cv2.imread(fname)
 
     image_np = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
@@ -206,7 +178,7 @@ def inference_video(opt):
 
     elapsed_time = time.time() - start_time
 
-    print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time))) 
+    print('%s-%05d; %d dets; %.2f Hz' % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),fidx,len(dets[-1]),(1/elapsed_time)))
 
   savepath = os.path.join(opt.work_dir,opt.reference,'faces.pckl')
 
@@ -247,76 +219,100 @@ def scene_detect(opt):
 
   print('%s - scenes detected %d'%(os.path.join(opt.avi_dir,opt.reference,'video.avi'),len(scene_list)))
 
   return scene_list
-    
-# ========== ========== ========== ==========
-# # EXECUTE DEMO
-# ========== ========== ========== ==========
+def run_pipeline(opt):
 
-# ========== DELETE EXISTING DIRECTORIES ==========
+  setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
+  setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
+  setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
+  setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
+  setattr(opt,'frames_dir',os.path.join(opt.data_dir,'pyframes'))
 
-if os.path.exists(os.path.join(opt.work_dir,opt.reference)):
-  rmtree(os.path.join(opt.work_dir,opt.reference))
+  # ========== ========== ========== ==========
+  # # EXECUTE DEMO
+  # ========== ========== ========== ==========
 
-if os.path.exists(os.path.join(opt.crop_dir,opt.reference)):
-  rmtree(os.path.join(opt.crop_dir,opt.reference))
+  # ========== DELETE EXISTING DIRECTORIES ==========
 
-if os.path.exists(os.path.join(opt.avi_dir,opt.reference)):
-  rmtree(os.path.join(opt.avi_dir,opt.reference))
+  if os.path.exists(os.path.join(opt.work_dir,opt.reference)):
+    rmtree(os.path.join(opt.work_dir,opt.reference))
 
-if os.path.exists(os.path.join(opt.frames_dir,opt.reference)):
-  rmtree(os.path.join(opt.frames_dir,opt.reference))
+  if os.path.exists(os.path.join(opt.crop_dir,opt.reference)):
+    rmtree(os.path.join(opt.crop_dir,opt.reference))
 
-if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
-  rmtree(os.path.join(opt.tmp_dir,opt.reference))
+  if os.path.exists(os.path.join(opt.avi_dir,opt.reference)):
+    rmtree(os.path.join(opt.avi_dir,opt.reference))
+
+  if os.path.exists(os.path.join(opt.frames_dir,opt.reference)):
+    rmtree(os.path.join(opt.frames_dir,opt.reference))
+
+  if os.path.exists(os.path.join(opt.tmp_dir,opt.reference)):
+    rmtree(os.path.join(opt.tmp_dir,opt.reference))
 
-# ========== MAKE NEW DIRECTORIES ==========
+  # ========== MAKE NEW DIRECTORIES ==========
 
-os.makedirs(os.path.join(opt.work_dir,opt.reference))
-os.makedirs(os.path.join(opt.crop_dir,opt.reference))
-os.makedirs(os.path.join(opt.avi_dir,opt.reference))
-os.makedirs(os.path.join(opt.frames_dir,opt.reference))
-os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
+  os.makedirs(os.path.join(opt.work_dir,opt.reference))
+  os.makedirs(os.path.join(opt.crop_dir,opt.reference))
+  os.makedirs(os.path.join(opt.avi_dir,opt.reference))
+  os.makedirs(os.path.join(opt.frames_dir,opt.reference))
+  os.makedirs(os.path.join(opt.tmp_dir,opt.reference))
 
-# ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
+  # ========== CONVERT VIDEO AND EXTRACT FRAMES ==========
 
-command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi')))
-output = subprocess.call(command, shell=True, stdout=None)
+  command = ("ffmpeg -y -i %s -qscale:v 2 -async 1 -r 25 %s" % (opt.videofile,os.path.join(opt.avi_dir,opt.reference,'video.avi')))
+  subprocess.run(command, shell=True, stdout=None, check=True)
 
-command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg')))
-output = subprocess.call(command, shell=True, stdout=None)
+  command = ("ffmpeg -y -i %s -qscale:v 2 -threads 1 -f image2 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.frames_dir,opt.reference,'%06d.jpg')))
+  subprocess.run(command, shell=True, stdout=None, check=True)
 
-command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav')))
-output = subprocess.call(command, shell=True, stdout=None)
+  command = ("ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s" % (os.path.join(opt.avi_dir,opt.reference,'video.avi'),os.path.join(opt.avi_dir,opt.reference,'audio.wav')))
+  subprocess.run(command, shell=True, stdout=None, check=True)
 
-# ========== FACE DETECTION ==========
+  # ========== FACE DETECTION ==========
 
-faces = inference_video(opt)
+  faces = inference_video(opt)
 
-# ========== SCENE DETECTION ==========
+  # ========== SCENE DETECTION ==========
 
-scene = scene_detect(opt)
+  scene = scene_detect(opt)
 
-# ========== FACE TRACKING ==========
+  # ========== FACE TRACKING ==========
 
-alltracks = []
-vidtracks = []
+  alltracks = []
+  vidtracks = []
 
-for shot in scene:
+  for shot in scene:
 
-  if shot[1].frame_num - shot[0].frame_num >= opt.min_track :
-    alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num]))
+    if shot[1].frame_num - shot[0].frame_num >= opt.min_track :
+      alltracks.extend(track_shot(opt,faces[shot[0].frame_num:shot[1].frame_num]))
 
-# ========== FACE TRACK CROP ==========
+  # ========== FACE TRACK CROP ==========
 
-for ii, track in enumerate(alltracks):
-  vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)))
+  for ii, track in enumerate(alltracks):
+    vidtracks.append(crop_video(opt,track,os.path.join(opt.crop_dir,opt.reference,'%05d'%ii)))
+
+  # ========== SAVE RESULTS ==========
+
+  savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl')
+
+  with open(savepath, 'wb') as fil:
+    pickle.dump(vidtracks, fil)
+
+  rmtree(os.path.join(opt.tmp_dir,opt.reference))
 
-# ========== SAVE RESULTS ==========
 
-savepath = os.path.join(opt.work_dir,opt.reference,'tracks.pckl')
+if __name__ == "__main__":
 
-with open(savepath, 'wb') as fil:
-  pickle.dump(vidtracks, fil)
+  parser = argparse.ArgumentParser(description = "FaceTracker")
+  parser.add_argument('--data_dir', type=str, default='data/work', help='Output directory')
+  parser.add_argument('--videofile', type=str, default='', help='Input video file')
+  parser.add_argument('--reference', type=str, default='', help='Video reference')
+  parser.add_argument('--facedet_scale', type=float, default=0.25, help='Scale factor for face detection')
+  parser.add_argument('--crop_scale', type=float, default=0.40, help='Scale bounding box')
+  parser.add_argument('--min_track', type=int, default=100, help='Minimum facetrack duration')
+  parser.add_argument('--frame_rate', type=int, default=25, help='Frame rate')
+  parser.add_argument('--num_failed_det', type=int, default=25, help='Number of missed detections allowed before tracking is stopped')
+  parser.add_argument('--min_face_size', type=int, default=100, help='Minimum face size in pixels')
+  opt = parser.parse_args()
 
-rmtree(os.path.join(opt.tmp_dir,opt.reference))
+  run_pipeline(opt)
\ No newline at end of file
diff --git a/src/syncnet/run_syncnet.py b/src/syncnet/run_syncnet.py
new file mode 100755
index 0000000..584a061
--- /dev/null
+++ b/src/syncnet/run_syncnet.py
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+#-*- coding: utf-8 -*-
+
+import argparse, os, glob
+
+from syncnet.SyncNetInstance import SyncNetInstance
+import numpy as np
+
+
+def run_syncnet(opt) -> float:
+    setattr(opt,'avi_dir',os.path.join(opt.data_dir,'pyavi'))
+    setattr(opt,'tmp_dir',os.path.join(opt.data_dir,'pytmp'))
+    setattr(opt,'work_dir',os.path.join(opt.data_dir,'pywork'))
+    setattr(opt,'crop_dir',os.path.join(opt.data_dir,'pycrop'))
+
+    # ==================== LOAD MODEL AND FILE LIST ====================
+
+    s = SyncNetInstance()
+
+    s.loadParameters(opt.initial_model)
+    print("Model %s loaded."%opt.initial_model)
+
+    flist = glob.glob(os.path.join(opt.crop_dir,opt.reference,'0*.avi'))
+    flist.sort()
+
+    # ==================== GET OFFSETS ====================
+
+    dists = []
+    for idx, fname in enumerate(flist):
+        *_, dist = s.evaluate(opt,videofile=fname)
+        dists.append(dist)
+
+    dists: np.ndarray = np.asarray(dists)[0]  # keep the first face track
+    no_time_shift_idx = dists.shape[-1] // 2  # middle of the 2*vshift+1 shift window, i.e. zero offset
+    dists = dists[:, no_time_shift_idx]
+    result = dists.mean()
+    print(f"Syncnet mean distance: {result}")
+    return result
+
+
+if __name__ == "__main__":
+    initial_model = os.path.join(os.environ["SYNCNET_MODEL_DIR"], "syncnet_v2.model")
+
+    parser = argparse.ArgumentParser(description = "SyncNet")
+    parser.add_argument('--initial_model', type=str, default=initial_model, help='')
+    parser.add_argument('--batch_size', type=int, default=20, help='')
+    parser.add_argument('--vshift', type=int, default=1, help='')
+    parser.add_argument('--data_dir', type=str, default='data/work', help='')
+    parser.add_argument('--videofile', type=str, default='', help='')
+    parser.add_argument('--reference', type=str, default='', help='')
+    opt = parser.parse_args()
+
+    run_syncnet(opt)
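
A minimal usage sketch for the layout introduced above (the paths are placeholders, not taken from the diff):

    export SYNCNET_MODEL_DIR=/path/to/models
    ./download_model.sh                                # fetches syncnet_v2.model and sfd_face.pth into $SYNCNET_MODEL_DIR
    pip install -e .                                   # installs the syncnet package from src/ via pyproject.toml
    python -m syncnet.run_all --video_path input.mp4   # runs the face pipeline, then prints the mean SyncNet distance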