This repository was archived by the owner on Jan 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathensemble.py
More file actions
156 lines (133 loc) · 5.92 KB
/
ensemble.py
File metadata and controls
156 lines (133 loc) · 5.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import numpy as np
import yaml
import gc
import torch
import torch.nn.functional as F
from torchvision.models.video import mc3_18, r2plus1d_18
from facenet_pytorch import MTCNN
from .cnn3d import *
from .cnn2d import *
from .constants import *
from .face_utils import *
__all__ = ['Ensemble']
def inference_3d(models_3d, faces, video):
    """Score a video with every 3D model; return one weighted score per model.

    For each model, every face in ``faces`` is sliced out of ``video``, passed
    through ``resize_and_square_face``, optionally centre-cropped to
    ``PRE_INFERENCE_CROP``, normalised according to the model's ``'norm'``
    entry and run through the model on the GPU. The per-face scores of one
    model are combined with ``USE_FACE_FUNCTION`` and scaled by ``RATIO_3D``.

    Args:
        models_3d: Iterable of dicts with the keys ``'model'`` (called as
            ``model(tensor)``) and ``'norm'`` (one of ``'i3d'``, ``'nil'``,
            ``'112_imagenet'``, ``'224_imagenet'``).
        faces: Iterable of ``((frame_from, frame_to), (row_from, row_to),
            (col_from, col_to))`` triples. The row/col upper bounds are
            treated as inclusive, the frame upper bound as exclusive.
        video: Array indexed as ``video[frames, rows, cols]``.

    Returns:
        list: One entry per model that yielded at least one face score.
        Inference is best-effort: if anything raises an ``Exception`` the
        error is logged and the predictions collected so far are returned
        (possibly an empty list). An unknown ``'norm'`` value raises
        ``ValueError`` internally and is handled the same way.
    """
    import logging  # local import so the block stays self-contained

    known_norms = ('i3d', 'nil', '112_imagenet', '224_imagenet')
    predictions = []
    try:
        for modeldict in models_3d:
            model = modeldict['model']
            norm = modeldict['norm']
            preds_video = []
            for face in faces:
                (frame_from, frame_to), (row_from, row_to), (col_from, col_to) = face
                x = video[frame_from:frame_to,
                          row_from:row_to + 1, col_from:col_to + 1]
                x = resize_and_square_face(x, output_size=OUTPUT_FACE_SIZE)
                if PRE_INFERENCE_CROP and PRE_INFERENCE_CROP != OUTPUT_FACE_SIZE:
                    x = center_crop_video(x, PRE_INFERENCE_CROP)

                with torch.no_grad():
                    # The imagenet modes are normalised frame by frame before
                    # the array is converted to a tensor.
                    # NOTE(review): the '112_imagenet' mode uses a transform
                    # named ..._114_...; confirm this pairing is intended.
                    if norm == '112_imagenet':
                        x = np.array([test_transforms_114_imagenet(
                            image=frame)['image'] for frame in x])
                    elif norm == '224_imagenet':
                        x = np.array([test_transforms_224_imagenet(
                            image=frame)['image'] for frame in x])

                    # Move the last axis of the 4-D array to the front.
                    x = torch.from_numpy(x.transpose([3, 0, 1, 2])).float()

                    if norm == 'i3d':
                        # Rescale from [0, 255] to [-1, 1].
                        x = (x / 255.) * 2 - 1
                    elif norm not in known_norms:
                        raise ValueError(
                            f"Unknown normalisation mode {norm}")

                    y_pred = model(x.cuda())
                    # Softmax over dim 1, averaged over dim 0; only the second
                    # of the two resulting values is used.
                    _, prob1 = torch.mean(
                        torch.exp(F.log_softmax(y_pred, dim=1)), dim=0)
                    if REVERSE_PROBS:
                        prob1 = 1 - prob1
                    preds_video.append(float(prob1))

            if preds_video:
                predictions.append(USE_FACE_FUNCTION(preds_video) * RATIO_3D)
    except Exception:
        # Deliberately best-effort: keep whatever was predicted so far, but
        # leave a trace instead of silently discarding the failure.
        logging.getLogger(__name__).exception(
            "3D inference failed; returning %d prediction(s) collected so far",
            len(predictions))
    return predictions
def inference_2d(model_2d, faces, coords, video, loader):
    """Score a video with the 2D model; return one weighted score or ``None``.

    For each entry of ``coords`` the matching entry of ``faces`` supplies the
    frame range. Up to 32 frames are cropped from ``video`` using the
    per-frame boxes, preprocessed with ``loader.dataset.process_video`` and
    passed through ``model_2d`` on the GPU. The per-face outputs are combined
    with ``USE_FACE_FUNCTION`` and scaled by ``RATIO_2D``.

    Args:
        model_2d: Model called as ``model_2d(tensor)``.
        faces: Sequence of ``((frame_from, frame_to), rows, cols)`` triples,
            indexed in parallel with ``coords``; only the frame range is used.
        coords: Iterable with one sequence of ``(x1, y1, x2, y2)`` boxes per
            face, one box per frame starting at ``frame_from``.
        video: Array indexed as ``video[frame, rows, cols]``.
        loader: Object exposing ``dataset.process_video``.

    Returns:
        ``USE_FACE_FUNCTION(per_face_outputs) * RATIO_2D``, or ``None`` when
        there were no faces to score or inference failed. Failures are logged
        rather than raised (best-effort).
    """
    import logging  # local import so the block stays self-contained

    frames_2d = 32  # upper bound on frames taken per face
    try:
        preds_video = []
        for i_coord, coordinate in enumerate(coords):
            (frame_from, frame_to), _, _ = faces[i_coord]
            frame_numbers = range(
                frame_from, min(frame_from + frames_2d, frame_to - 1))
            # zip stops at the shorter of the frame range and the box list.
            x = np.asarray([
                video[frame_number, y1:y2, x1:x2]
                for frame_number, (x1, y1, x2, y2)
                in zip(frame_numbers, coordinate)
            ])
            x = loader.dataset.process_video(x)
            # Original author's note: "flip every other frame" - reverses the
            # last axis for every second index along axis 1.
            x[:, ::2] = x[:, ::2, :, ::-1]
            with torch.no_grad():
                out = model_2d(torch.from_numpy(
                    np.ascontiguousarray(x)).unsqueeze(0).cuda())
            preds_video.append(out.cpu().numpy())

        if preds_video:
            return USE_FACE_FUNCTION(preds_video) * RATIO_2D
        return None
    except Exception:
        # Deliberately best-effort: the caller treats None as "no 2D score",
        # but the failure is logged instead of being silently discarded.
        logging.getLogger(__name__).exception("2D inference failed")
        return None
class Ensemble():
    """Face detector plus 3D and 2D models combined into one video score.

    Construction builds an MTCNN face detector on ``cuda:0``, the 3D models
    returned by ``build_models()`` and a 2D model whose configuration and
    weights are read from paths relative to the current working directory.
    """

    def __init__(self):
        """Build the face detector, the 3D models, the 2D model and its loader."""
        self.mtcnn = MTCNN(margin=0, keep_all=True, post_process=False, select_largest=False,
                           device='cuda:0', thresholds=MTCNN_THRESHOLDS, factor=MMTNN_FACTOR)
        self.models_3d = build_models()

        # NOTE: FullLoader is applied to a repository-local config file; use
        # yaml.safe_load instead if this path can ever hold untrusted input.
        with open('./medics/cnn2d/experiment001.yaml') as f:
            cfg = yaml.load(f, Loader=yaml.FullLoader)
        # The weights are loaded from the checkpoint below, so the config's
        # 'pretrained' entry is cleared before building the model.
        cfg['model']['params']['pretrained'] = None
        model2d = build_model(cfg['model']['name'], cfg['model']['params'])
        model2d.load_state_dict(torch.load(
            './weights/medics/SRXT50_094_VM-0.2504.PTH'))
        self.model_2d = model2d.eval().cuda()

        # Built with empty file/label lists and later passed to inference_2d.
        self.loader = build_dataloader(
            cfg, data_info={'vidfiles': [], 'labels': []}, mode='predict')

    def inference(self, video_path):
        """Return the ensemble score for the video at ``video_path``.

        Args:
            video_path: Path handed to ``face_detection_wrapper`` and
                ``load_video``.

        Returns:
            The mean of all available model predictions clipped to
            ``[PROB_MIN, PROB_MAX]``, or ``0.5`` when no face was detected or
            no model produced a prediction.
        """
        faces, coords = face_detection_wrapper(
            self.mtcnn, video_path, every_n_frames=FACE_FRAMES,
            facedetection_downsample=FACEDETECTION_DOWNSAMPLE,
            max_frames_to_load=MAX_FRAMES_TO_LOAD)
        if len(faces) == 0:
            return 0.5

        # Only load as many frames as the detected faces actually span.
        last_frame_needed = get_last_frame_needed_across_faces(faces)
        video, _rescale = load_video(video_path, every_n_frames=1, to_rgb=True, rescale=None,
                                     inc_pil=False, max_frames=last_frame_needed)

        predictions = inference_3d(self.models_3d, faces, video)
        preds_2d = inference_2d(self.model_2d, faces,
                                coords, video, self.loader)
        # Compare against None explicitly: a truthiness test would discard a
        # legitimate 2D score of exactly 0.
        if preds_2d is not None:
            predictions.append(preds_2d)

        if not predictions:
            return 0.5
        return np.clip(np.mean(predictions), PROB_MIN, PROB_MAX)

    def __del__(self):
        """Drop the model references and release cached GPU memory."""
        # __init__ may have failed part-way (or never run), so tolerate
        # attributes that were never set.
        for attr in ('mtcnn', 'models_3d', 'model_2d', 'loader'):
            vars(self).pop(attr, None)
        # Collect first so memory held by just-collected objects is already
        # unoccupied when the CUDA cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()