1
1
import cv2
2
- import mediapipe as mp
3
2
import numpy as np
4
3
5
4
import math
10
9
from typing import List
11
10
from typing import Tuple
12
11
13
- # Code to overlay the face mesh point taken from https://google.github.io/mediapipe/solutions/holistic.html
14
- mp_drawing = mp .solutions .drawing_utils
12
+ # Extracts the face mesh data from the frames of a video using MediaPipe.
13
+ # See: https://ai.google.dev/edge/mediapipe/solutions/vision/face_landmarker
14
+
15
+ # Code to overlay the face mesh point taken from https://colab.research.google.com/github/googlesamples/mediapipe/blob/main/examples/face_landmarker/python/%5BMediaPipe_Python_Tasks%5D_Face_Landmarker.ipynb
16
+ import mediapipe as mp
17
+ from mediapipe .tasks import python as mp_python
18
+ from mediapipe .tasks .python import vision as mp_vision
19
+ VisionRunningMode = mp .tasks .vision .RunningMode
20
+
21
+
22
+
23
+ MEDIAPIPE_FACE_LANDMARKS_COUNT = 478
15
24
16
- MEDIAPIPE_FACE_LANDMARKS_COUNT = 468
25
+ MEDIAPIPE_FACE_BLENDSHAPES_COUNT = 52
17
26
18
27
# Vertices numbers derived from uv texture or from FBX model.
19
28
# Vertices on the front. Will be used to compute the reference horizontal vector
28
37
VERTEX_ID_NOSE_BASE = 168
29
38
VERTEX_ID_NOSE_TIP = 4
30
39
31
- VERTICES_TO_DRAW = {
40
+ VERTICES_TO_HIGHLIGHT = {
32
41
VERTEX_ID_FRONT_TOP_RIGHT , VERTEX_ID_FRONT_TOP_LEFT ,
33
42
VERTEX_ID_EAR_TOP_R , VERTEX_ID_EAR_TOP_L ,
34
43
VERTEX_ID_JAW_BASE_R , VERTEX_ID_JAW_BASE_L ,
37
46
38
47
39
48
def vec_len (a : np .ndarray ) -> float :
49
+ """Computes the length of a vector."""
40
50
41
51
a = np .power (a , 2 )
42
52
a = a .sum ()
@@ -47,7 +57,7 @@ def vec_len(a: np.ndarray) -> float:
47
57
48
58
def normalize_face_landmarks (landmarks : List [List [float ]], frame_width_px : int , frame_height_px : int ,
49
59
nose_translation : np .ndarray , rot_mat : np .ndarray , scale : float ) -> List [List [float ]]:
50
- """Performs a normalizatiopn of the orientation of the Mediapipe face landmarks using forehead and lateral
60
+ """Performs a normalization of the orientation of the Mediapipe face landmarks using forehead and lateral
51
61
keypoints to build a rigid reference system
52
62
"""
53
63
@@ -187,9 +197,18 @@ def extract_face_data(frames_in: VideoFrameProducer,
187
197
"""
188
198
189
199
# For video input:
190
- mp_face_mesh = mp .solutions .face_mesh
191
- face_mesh = mp_face_mesh .FaceMesh (min_detection_confidence = 0.7 , min_tracking_confidence = 0.5 ,
192
- static_image_mode = False , refine_landmarks = False )
200
+ #mp_face_mesh = mp.solutions.face_mesh
201
+ #face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.7, min_tracking_confidence=0.5,
202
+ # static_image_mode=False, refine_landmarks=False)
203
+
204
+ #
205
+ # Initialize the Mediapipe Face Landmarker
206
+ base_options = mp_python .BaseOptions (model_asset_path = 'models/face_landmarker.task' )
207
+ options = mp_vision .FaceLandmarkerOptions (base_options = base_options ,
208
+ output_face_blendshapes = True ,
209
+ output_facial_transformation_matrixes = True ,
210
+ num_faces = 1 )
211
+ face_landmarker = mp_vision .FaceLandmarker .create_from_options (options )
193
212
194
213
# Will store the H and W of the input video frame
195
214
width = None
@@ -210,33 +229,41 @@ def extract_face_data(frames_in: VideoFrameProducer,
210
229
height = rgb_image .shape [0 ]
211
230
212
231
# To improve performance, optionally mark the image as not writeable to pass by reference.
213
- rgb_image . flags . writeable = False
214
- results = face_mesh . process ( rgb_image )
232
+ mp_image = mp . Image ( image_format = mp . ImageFormat . SRGB , data = rgb_image )
233
+ results = face_landmarker . detect ( mp_image )
215
234
216
235
# Check if at least one face is available
217
- if results .multi_face_landmarks is None :
236
+ if len ( results .face_landmarks ) == 0 :
218
237
landmarks = None
219
238
# We just fill in the data with NaNs
220
- lm_list = [[float ('nan' )] * 3 ] * MEDIAPIPE_FACE_LANDMARKS_COUNT
239
+ orig_frame_lm_list = [[float ('nan' )] * 3 ] * MEDIAPIPE_FACE_LANDMARKS_COUNT
240
+ out_frame_lm_list = [[float ('nan' )] * 3 ] * MEDIAPIPE_FACE_LANDMARKS_COUNT
221
241
nose_tip = np .asarray ([float ('nan' )] * 3 , dtype = np .float32 )
222
242
R = np .asarray ([float ('nan' )] * 9 , dtype = np .float32 ).reshape (3 , 3 )
223
243
scale = float ('nan' )
224
244
else :
225
245
226
246
# Assume there is only one face
227
- landmarks = results .multi_face_landmarks [0 ]
247
+ landmarks = results .face_landmarks [0 ]
248
+
249
+ assert len (landmarks ) == MEDIAPIPE_FACE_LANDMARKS_COUNT
250
+
228
251
229
252
# Map the list of landmarks into a bi-dimensional array (and convert it into a list)
230
- lm_list = list (map (lambda l : [l .x , l .y , l .z ], landmarks . landmark ))
253
+ orig_frame_lm_list = list (map (lambda l : [l .x , l .y , l .z ], landmarks ))
231
254
232
- nose_tip , R , scale = compute_normalization_params (landmarks = lm_list , frame_width_px = width , frame_height_px = height )
255
+ nose_tip , R , scale = compute_normalization_params (landmarks = orig_frame_lm_list , frame_width_px = width , frame_height_px = height )
233
256
234
257
if normalize_landmarks :
235
- lm_list = normalize_face_landmarks (landmarks = lm_list , frame_width_px = width , frame_height_px = height ,
258
+ out_frame_lm_list = normalize_face_landmarks (landmarks = orig_frame_lm_list , frame_width_px = width , frame_height_px = height ,
236
259
nose_translation = nose_tip , rot_mat = R , scale = scale )
260
+ else :
261
+ out_frame_lm_list = orig_frame_lm_list
237
262
238
- assert type (lm_list ) == list
239
- assert len (lm_list ) == MEDIAPIPE_FACE_LANDMARKS_COUNT
263
+ assert type (orig_frame_lm_list ) == list
264
+ assert type (out_frame_lm_list ) == list
265
+ assert len (orig_frame_lm_list ) == MEDIAPIPE_FACE_LANDMARKS_COUNT
266
+ assert len (out_frame_lm_list ) == MEDIAPIPE_FACE_LANDMARKS_COUNT
240
267
assert type (nose_tip ) == np .ndarray
241
268
assert nose_tip .shape == (3 ,)
242
269
assert type (R ) == np .ndarray
@@ -249,7 +276,7 @@ def extract_face_data(frames_in: VideoFrameProducer,
249
276
out_scales = np .append (out_scales , [np .float32 (scale )], axis = 0 )
250
277
251
278
# Append to frames container
252
- out_landmarks_list .append (lm_list )
279
+ out_landmarks_list .append (out_frame_lm_list )
253
280
254
281
#
255
282
# Manage composite video output
@@ -261,21 +288,33 @@ def extract_face_data(frames_in: VideoFrameProducer,
261
288
# Draw face mesh landmarks on the overlay image.
262
289
if landmarks is not None :
263
290
264
- # Let's use 1 pixel radius every 500 pixels of video. Can be 0, but it is OK.
265
- norm_landmark_radius = int (width / 500 )
291
+ # Let's use 1 pixel radius every 600 pixels of video width (minimum 1 pixel).
292
+ norm_landmark_radius = max ( 1 , int (width / 600 ) )
266
293
# Set the thickness as the same as the radius.
267
294
norm_landmark_thickness = norm_landmark_radius
268
- # Drawing specifications for MediaPipe
269
- drawing_specs = mp_drawing .DrawingSpec (color = mp_drawing .RED_COLOR ,
270
- circle_radius = norm_landmark_radius ,
271
- thickness = norm_landmark_thickness )
272
-
273
- # print('face_landmarks:', face_landmarks)
274
- mp_drawing .draw_landmarks (
275
- image = annotated_image ,
276
- landmark_list = landmarks ,
277
- landmark_drawing_spec = drawing_specs
278
- )
295
+
296
+ #
297
+ # Draw the landmarks over the face
298
+ for i , lm in enumerate (orig_frame_lm_list ):
299
+ lm_x , lm_y , lm_z = lm [:]
300
+
301
+ # If a coordinate is NaN, it's because the face was not found
302
+ if math .isnan (lm_x ):
303
+ continue
304
+
305
+ # As the landmarks are already normalized in a range [0,1],
306
+ # scale them to the full output frame resolution
307
+
308
+ lm_x *= width
309
+ lm_y *= height
310
+
311
+ if i in VERTICES_TO_HIGHLIGHT :
312
+ vcol = (20 , 220 , 220 )
313
+ else :
314
+ vcol = (20 , 20 , 220 )
315
+
316
+ cv2 .circle (img = annotated_image , center = (int (lm_x ), int (lm_y )), radius = norm_landmark_radius ,
317
+ color = vcol , thickness = norm_landmark_thickness )
279
318
280
319
#
281
320
# DEBUG: save the landmarks to a file
@@ -284,18 +323,17 @@ def extract_face_data(frames_in: VideoFrameProducer,
284
323
# pickle.dump(obj=lm_list, file=outfile)
285
324
286
325
#
287
- # Print landmarks with custom routine
288
- # Fill the upper left quarter of the image using a orthographic projection (i.e., use only x and y)
326
+ # Draw the landmarks in the upper left corner of the image using an orthographic projection (i.e., use only x and y)
289
327
# and we use the depth to modulate the color intensity.
290
328
291
329
# First compute the dynamic range of the z coordinate among all points
292
- zs = [p [2 ] for p in lm_list ]
330
+ zs = [p [2 ] for p in out_frame_lm_list ]
293
331
z_min = min (zs )
294
332
z_max = max (zs )
295
333
z_range = z_max - z_min
296
334
297
335
# Draw the landmarks
298
- for i , lm in enumerate (lm_list ):
336
+ for i , lm in enumerate (out_frame_lm_list ):
299
337
lm_x , lm_y , lm_z = lm [:]
300
338
301
339
# If a coordinate is NaN, it's because the face was not found
@@ -304,9 +342,9 @@ def extract_face_data(frames_in: VideoFrameProducer,
304
342
305
343
# As the landmarks are already normalized in a range [0,1],
306
344
# bring them to the half of the output frame resolution
307
-
308
345
lm_x *= width / 2
309
346
lm_y *= height / 2
347
+ # rescale z in [0,1]
310
348
norm_z = 1 - ((lm_z - z_min ) / z_range )
311
349
312
350
cv2 .circle (img = annotated_image , center = (int (lm_x ), int (lm_y )), radius = norm_landmark_radius ,
0 commit comments