This repository was archived by the owner on Jan 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathface_utils.py
More file actions
370 lines (304 loc) · 14.5 KB
/
face_utils.py
File metadata and controls
370 lines (304 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
import numpy as np
from PIL import Image
import cv2
import math
import skimage.measure
from .constants import *
def load_video(filename, every_n_frames=None, specific_frames=None, to_rgb=True, rescale=None, inc_pil=False, max_frames=None):
"""Loads a video.
Called by:
1) The finding faces algorithm where it pulls a frame every FACE_FRAMES frames up to MAX_FRAMES_TO_LOAD at a scale of FACEDETECTION_DOWNSAMPLE, and then half that if there's a CUDA memory error.
2) The inference loop where it pulls EVERY frame up to a certain amount which it the last needed frame for each face for that video"""
assert every_n_frames or specific_frames, "Must supply either every n_frames or specific_frames"
assert bool(every_n_frames) != bool(
specific_frames), "Supply either 'every_n_frames' or 'specific_frames', not both"
cap = cv2.VideoCapture(filename)
n_frames_in = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
width_in = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height_in = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
if rescale:
rescale = rescale * 1920./np.max((width_in, height_in))
width_out = int(width_in*rescale) if rescale else width_in
height_out = int(height_in*rescale) if rescale else height_in
if max_frames:
n_frames_in = min(n_frames_in, max_frames)
if every_n_frames:
specific_frames = list(range(0, n_frames_in, every_n_frames))
n_frames_out = len(specific_frames)
out_pil = []
out_video = np.empty(
(n_frames_out, height_out, width_out, 3), np.dtype('uint8'))
i_frame_in = 0
i_frame_out = 0
ret = True
while (i_frame_in < n_frames_in and ret):
try:
try:
if every_n_frames == 1:
ret, frame_in = cap.read() # Faster if reading all frames
else:
ret = cap.grab()
if i_frame_in not in specific_frames:
i_frame_in += 1
continue
ret, frame_in = cap.retrieve()
# print(f"Reading frame {i_frame_in}")
if rescale:
frame_in = cv2.resize(frame_in, (width_out, height_out))
if to_rgb:
frame_in = cv2.cvtColor(frame_in, cv2.COLOR_BGR2RGB)
except Exception as e:
print(
f"Error for frame {i_frame_in} for video {filename}: {e}; using 0s")
frame_in = np.zeros((height_out, width_out, 3))
out_video[i_frame_out] = frame_in
i_frame_out += 1
if inc_pil:
try: # https://www.kaggle.com/zaharch/public-test-errors
pil_img = Image.fromarray(frame_in)
except Exception as e:
print(
f"Using a blank frame for video {filename} frame {i_frame_in} as error {e}")
pil_img = Image.fromarray(
np.zeros((224, 224, 3), dtype=np.uint8)) # Use a blank frame
out_pil.append(pil_img)
i_frame_in += 1
except Exception as e:
print(f"Error for file {filename}: {e}")
cap.release()
if inc_pil:
return out_video, out_pil, rescale
else:
return out_video, rescale
def get_roi_for_each_face(faces_by_frame, probs, video_shape, temporal_upsample, upsample=1):
# Create boolean face array
frames_video, rows_video, cols_video, channels_video = video_shape
frames_video = math.ceil(frames_video)
boolean_face_3d = np.zeros(
(frames_video, rows_video, cols_video), dtype=np.bool) # Remove colour channel
proba_face_3d = np.zeros(
(frames_video, rows_video, cols_video)).astype('float32')
for i_frame, faces in enumerate(faces_by_frame):
if faces is not None: # May not be a face in the frame
for i_face, face in enumerate(faces):
left, top, right, bottom = face
boolean_face_3d[i_frame, int(top):int(
bottom), int(left):int(right)] = True
proba_face_3d[i_frame, int(top):int(bottom), int(
left):int(right)] = probs[i_frame][i_face]
# Replace blank frames if face(s) in neighbouring frames with overlap
for i_frame, frame in enumerate(boolean_face_3d):
if i_frame == 0 or i_frame == frames_video-1: # Can't do this for 1st or last frame
continue
if True not in frame:
if TWO_FRAME_OVERLAP:
if i_frame > 1:
pre_overlap = boolean_face_3d[i_frame -
1] | boolean_face_3d[i_frame-2]
else:
pre_overlap = boolean_face_3d[i_frame-1]
if i_frame < frames_video-2:
post_overlap = boolean_face_3d[i_frame +
1] | boolean_face_3d[i_frame+2]
else:
post_overlap = boolean_face_3d[i_frame+1]
neighbour_overlap = pre_overlap & post_overlap
else:
neighbour_overlap = boolean_face_3d[i_frame -
1] & boolean_face_3d[i_frame+1]
boolean_face_3d[i_frame] = neighbour_overlap
# Find faces through time
id_face_3d, n_faces = skimage.measure.label(
boolean_face_3d, return_num=True)
region_labels, counts = np.unique(id_face_3d, return_counts=True)
# Get rid of background=0
region_labels, counts = region_labels[1:], counts[1:]
###################
# DESCENDING SIZE #
###################
descending_size = np.argsort(counts)[::-1]
labels_by_size = region_labels[descending_size]
####################
# DESCENDING PROBS #
####################
probs = [np.mean(proba_face_3d[id_face_3d == i_face])
for i_face in region_labels]
descending_probs = np.argsort(probs)[::-1]
labels_by_probs = region_labels[descending_probs]
# Iterate over faces in video
rois = []
face_maps = []
for i_face in labels_by_probs: # labels_by_size:
# Find the first and last frame containing the face
frames = np.where(np.any(id_face_3d == i_face, axis=(1, 2)) == True)
starting_frame, ending_frame = frames[0].min(), frames[0].max()
# Iterate over the frames with faces in and find the min/max cols/rows (bounding box)
cols, rows = [], []
for i_frame in range(starting_frame, ending_frame + 1):
rs = np.where(
np.any(id_face_3d[i_frame] == i_face, axis=1) == True)
rows.append((rs[0].min(), rs[0].max()))
cs = np.where(
np.any(id_face_3d[i_frame] == i_face, axis=0) == True)
cols.append((cs[0].min(), cs[0].max()))
frame_from, frame_to = starting_frame * \
temporal_upsample, ((ending_frame+1)*temporal_upsample)-1
rows_from, rows_to = np.array(
rows)[:, 0].min(), np.array(rows)[:, 1].max()
cols_from, cols_to = np.array(
cols)[:, 0].min(), np.array(cols)[:, 1].max()
frame_to = min(frame_to, frame_from + MAX_FRAMES_FOR_FACE)
if frame_to - frame_from >= MIN_FRAMES_FOR_FACE:
tmp_face_map = id_face_3d.copy()
tmp_face_map[tmp_face_map != i_face] = 0
tmp_face_map[tmp_face_map == i_face] = 1
face_maps.append(
tmp_face_map[frame_from//temporal_upsample:frame_to//temporal_upsample+1])
rois.append(((frame_from, frame_to),
(int(rows_from*upsample), int(rows_to*upsample)),
(int(cols_from*upsample), int(cols_to*upsample))))
return np.array(rois), face_maps
def get_center(bbox):
x1, y1, x2, y2 = bbox
return (x1+x2)/2, (y1+y2)/2
def get_coords(faces_roi):
coords = np.argwhere(faces_roi == 1)
# print(coords)
if coords.shape[0] == 0:
return None
y1, x1 = coords[0]
y2, x2 = coords[-1]
return x1, y1, x2, y2
def interpolate_center(c1, c2, length):
x1, y1 = c1
x2, y2 = c2
xi, yi = np.linspace(x1, x2, length), np.linspace(y1, y2, length)
return np.vstack([xi, yi]).transpose(1, 0)
def get_faces(faces_roi, upsample):
all_faces = []
rows = faces_roi[0].shape[1]
cols = faces_roi[0].shape[2]
for i in range(len(faces_roi)):
faces = np.asarray([get_coords(faces_roi[i][j])
for j in range(len(faces_roi[i]))])
if faces[0] is None:
faces[0] = faces[1]
if faces[-1] is None:
faces[-1] = faces[-2]
if None in faces:
# print(faces)
raise Exception('This should not have happened ...')
all_faces.append(faces)
extracted_faces = []
for face in all_faces:
# Get max dim size
max_dim = np.concatenate(
[face[:, 2]-face[:, 0], face[:, 3]-face[:, 1]])
max_dim = np.percentile(max_dim, 90)
# Enlarge by 1.2
max_dim = int(max_dim * 1.2)
# Get center coords
centers = np.asarray([get_center(_) for _ in face])
# Interpolate
centers = np.vstack([interpolate_center(
centers[i], centers[i+1], length=10) for i in range(len(centers)-1)]).astype('int')
x1y1 = centers - max_dim // 2
x2y2 = centers + max_dim // 2
x1, y1 = x1y1[:, 0], x1y1[:, 1]
x2, y2 = x2y2[:, 0], x2y2[:, 1]
# If x1 or y1 is negative, turn it to 0
# Then add to x2 y2 or y2
x2[x1 < 0] -= x1[x1 < 0]
y2[y1 < 0] -= y1[y1 < 0]
x1[x1 < 0] = 0
y1[y1 < 0] = 0
# If x2 or y2 is too big, turn it to max image shape
# Then subtract from y1
y1[y2 > rows] += rows - y2[y2 > rows]
x1[x2 > cols] += cols - x2[x2 > cols]
y2[y2 > rows] = rows
x2[x2 > cols] = cols
vidface = np.asarray([[x1[_], y1[_], x2[_], y2[_]]
for _, c in enumerate(centers)])
vidface = (vidface*upsample).astype('int')
extracted_faces.append(vidface)
return extracted_faces
def detect_face_with_mtcnn(mtcnn_model, pil_frames, facedetection_upsample, video_shape, face_frames):
boxes, _probs = mtcnn_model.detect(pil_frames, landmarks=False)
faces, faces_roi = get_roi_for_each_face(
faces_by_frame=boxes, probs=_probs, video_shape=video_shape, temporal_upsample=face_frames, upsample=facedetection_upsample)
coords = [] if len(faces_roi) == 0 else get_faces(
faces_roi, upsample=facedetection_upsample)
return faces, coords
def face_detection_wrapper(mtcnn_model, videopath, every_n_frames, facedetection_downsample, max_frames_to_load):
video, pil_frames, rescale = load_video(videopath, every_n_frames=every_n_frames, to_rgb=True,
rescale=facedetection_downsample, inc_pil=True, max_frames=max_frames_to_load)
if len(pil_frames):
try:
faces, coords = detect_face_with_mtcnn(mtcnn_model=mtcnn_model,
pil_frames=pil_frames,
facedetection_upsample=1/rescale,
video_shape=video.shape,
face_frames=every_n_frames)
except RuntimeError: # Out of CUDA RAM
print(f"Failed to process {videopath} ! Downsampling x2 ...")
video, pil_frames, rescale = load_video(videopath, every_n_frames=every_n_frames, to_rgb=True,
rescale=facedetection_downsample/2, inc_pil=True, max_frames=max_frames_to_load)
try:
faces, coords = detect_face_with_mtcnn(mtcnn_model=mtcnn_model,
pil_frames=pil_frames,
facedetection_upsample=1/rescale,
video_shape=video.shape,
face_frames=every_n_frames)
except RuntimeError:
print(f"Failed on downsample ! Skipping...")
return [], []
else:
print('Failed to fetch frames ! Skipping ...')
return [], []
if len(faces) == 0:
print('Failed to find faces ! Upsampling x2 ...')
try:
video, pil_frames, rescale = load_video(videopath, every_n_frames=every_n_frames, to_rgb=True,
rescale=facedetection_downsample*2, inc_pil=True, max_frames=max_frames_to_load)
faces, coords = detect_face_with_mtcnn(mtcnn_model=mtcnn_model,
pil_frames=pil_frames,
facedetection_upsample=1/rescale,
video_shape=video.shape,
face_frames=every_n_frames)
except Exception as e:
print(e)
return [], []
return faces, coords
def get_last_frame_needed_across_faces(faces):
last_frame = 0
for face in faces:
(frame_from, frame_to), (row_from, row_to), (col_from, col_to) = face
last_frame = max(frame_to, last_frame)
return last_frame
def resize_and_square_face(video, output_size):
# We will square it, so this is the effective input size
input_size = max(video.shape[1], video.shape[2])
out_video = np.empty(
(len(video), output_size[0], output_size[1], 3), np.dtype('uint8'))
for i_frame, frame in enumerate(video):
padded_image = np.zeros((input_size, input_size, 3))
padded_image[0:frame.shape[0], 0:frame.shape[1]] = frame
if (input_size, input_size) != output_size:
frame = cv2.resize(
padded_image, (output_size[0], output_size[1])).astype(np.uint8)
else:
frame = padded_image.astype(np.uint8)
out_video[i_frame] = frame
return out_video
def center_crop_video(video, crop_dimensions):
height, width = video.shape[1], video.shape[2]
crop_height, crop_width = crop_dimensions
y1 = (height - crop_height) // 2
y2 = y1 + crop_height
x1 = (width - crop_width) // 2
x2 = x1 + crop_width
video_out = np.zeros((len(video), crop_height, crop_width, 3))
for i_frame, frame in enumerate(video):
video_out[i_frame] = frame[y1:y2, x1:x2]
return video_out