Skip to content

Commit 84ba324

Browse files
committed
Switched from mediapipe solution FaceMesh to FaceLandmarker
1 parent e6bab3b commit 84ba324

File tree

4 files changed

+88
-40
lines changed

4 files changed

+88
-40
lines changed

Changelog.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@ Types of changes:
1717

1818
## [Unreleased]
1919

20-
2120
### Changed
2221

22+
- Switched from the old MediaPipe 0.8 FaceMesh solution to the new 0.10 FaceLandmarker
23+
- Number of landmarks increased from 468 to 478! The output shape of the landmark information numpy file has changed!
24+
25+
2326
## [0.3] - 2025-08-14
2427

2528
- Switch to MediaPipe 0.10.x (from 0.8.9: API changes)

models/README.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Place the external models here:
2+
3+
* MediaPipe Face Landmarker `face_landmarker.task` (from: https://ai.google.dev/edge/mediapipe/solutions/vision/face_landmarker)

slvideotools/extract_face_data.py

Lines changed: 76 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import cv2
2-
import mediapipe as mp
32
import numpy as np
43

54
import math
@@ -10,10 +9,20 @@
109
from typing import List
1110
from typing import Tuple
1211

13-
# Code to overlay the face mesh point taken from https://google.github.io/mediapipe/solutions/holistic.html
14-
mp_drawing = mp.solutions.drawing_utils
12+
# Extracts the face mesh data from the frames of a video using MediaPipe.
13+
# See: https://ai.google.dev/edge/mediapipe/solutions/vision/face_landmarker
14+
15+
# Code to overlay the face mesh point taken from https://colab.research.google.com/github/googlesamples/mediapipe/blob/main/examples/face_landmarker/python/%5BMediaPipe_Python_Tasks%5D_Face_Landmarker.ipynb
16+
import mediapipe as mp
17+
from mediapipe.tasks import python as mp_python
18+
from mediapipe.tasks.python import vision as mp_vision
19+
VisionRunningMode = mp.tasks.vision.RunningMode
20+
21+
22+
23+
MEDIAPIPE_FACE_LANDMARKS_COUNT = 478
1524

16-
MEDIAPIPE_FACE_LANDMARKS_COUNT = 468
25+
MEDIAPIPE_FACE_BLENDSHAPES_COUNT = 52
1726

1827
# Vertices numbers derived from uv texture or from FBX model.
1928
# Vertices on the front. Will be used to compute the reference horizontal vector
@@ -28,7 +37,7 @@
2837
VERTEX_ID_NOSE_BASE = 168
2938
VERTEX_ID_NOSE_TIP = 4
3039

31-
VERTICES_TO_DRAW = {
40+
VERTICES_TO_HIGHLIGHT = {
3241
VERTEX_ID_FRONT_TOP_RIGHT, VERTEX_ID_FRONT_TOP_LEFT,
3342
VERTEX_ID_EAR_TOP_R, VERTEX_ID_EAR_TOP_L,
3443
VERTEX_ID_JAW_BASE_R, VERTEX_ID_JAW_BASE_L,
@@ -37,6 +46,7 @@
3746

3847

3948
def vec_len(a: np.ndarray) -> float:
49+
"""Computes the length of a vector."""
4050

4151
a = np.power(a, 2)
4252
a = a.sum()
@@ -47,7 +57,7 @@ def vec_len(a: np.ndarray) -> float:
4757

4858
def normalize_face_landmarks(landmarks: List[List[float]], frame_width_px: int, frame_height_px: int,
4959
nose_translation: np.ndarray, rot_mat: np.ndarray, scale: float) -> List[List[float]]:
50-
"""Performs a normalizatiopn of the orientation of the Mediapipe face landmarks using forehead and lateral
60+
"""Performs a normalization of the orientation of the Mediapipe face landmarks using forehead and lateral
5161
keypoints to build a rigid reference system
5262
"""
5363

@@ -187,9 +197,18 @@ def extract_face_data(frames_in: VideoFrameProducer,
187197
"""
188198

189199
# For video input:
190-
mp_face_mesh = mp.solutions.face_mesh
191-
face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.7, min_tracking_confidence=0.5,
192-
static_image_mode=False, refine_landmarks=False)
200+
#mp_face_mesh = mp.solutions.face_mesh
201+
#face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.7, min_tracking_confidence=0.5,
202+
# static_image_mode=False, refine_landmarks=False)
203+
204+
#
205+
# Initialize the Mediapipe Face Landmarker
206+
base_options = mp_python.BaseOptions(model_asset_path='models/face_landmarker.task')
207+
options = mp_vision.FaceLandmarkerOptions(base_options=base_options,
208+
output_face_blendshapes=True,
209+
output_facial_transformation_matrixes=True,
210+
num_faces=1)
211+
face_landmarker = mp_vision.FaceLandmarker.create_from_options(options)
193212

194213
# Will store the H and W of the input video frame
195214
width = None
@@ -210,33 +229,41 @@ def extract_face_data(frames_in: VideoFrameProducer,
210229
height = rgb_image.shape[0]
211230

212231
# To improve performance, optionally mark the image as not writeable to pass by reference.
213-
rgb_image.flags.writeable = False
214-
results = face_mesh.process(rgb_image)
232+
mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_image)
233+
results = face_landmarker.detect(mp_image)
215234

216235
# Check if at least one face is available
217-
if results.multi_face_landmarks is None:
236+
if len(results.face_landmarks) == 0:
218237
landmarks = None
219238
# We just fill in the data with NaNs
220-
lm_list = [[float('nan')] * 3] * MEDIAPIPE_FACE_LANDMARKS_COUNT
239+
orig_frame_lm_list = [[float('nan')] * 3] * MEDIAPIPE_FACE_LANDMARKS_COUNT
240+
out_frame_lm_list = [[float('nan')] * 3] * MEDIAPIPE_FACE_LANDMARKS_COUNT
221241
nose_tip = np.asarray([float('nan')] * 3, dtype=np.float32)
222242
R = np.asarray([float('nan')] * 9, dtype=np.float32).reshape(3, 3)
223243
scale = float('nan')
224244
else:
225245

226246
# Assume there is only one face
227-
landmarks = results.multi_face_landmarks[0]
247+
landmarks = results.face_landmarks[0]
248+
249+
assert len(landmarks) == MEDIAPIPE_FACE_LANDMARKS_COUNT
250+
228251

229252
# Map the list of landmarks into a bi-dimensional array (and convert it into a list)
230-
lm_list = list(map(lambda l: [l.x, l.y, l.z], landmarks.landmark))
253+
orig_frame_lm_list = list(map(lambda l: [l.x, l.y, l.z], landmarks))
231254

232-
nose_tip, R, scale = compute_normalization_params(landmarks=lm_list, frame_width_px=width, frame_height_px=height)
255+
nose_tip, R, scale = compute_normalization_params(landmarks=orig_frame_lm_list, frame_width_px=width, frame_height_px=height)
233256

234257
if normalize_landmarks:
235-
lm_list = normalize_face_landmarks(landmarks=lm_list, frame_width_px=width, frame_height_px=height,
258+
out_frame_lm_list = normalize_face_landmarks(landmarks=orig_frame_lm_list, frame_width_px=width, frame_height_px=height,
236259
nose_translation=nose_tip, rot_mat=R, scale=scale)
260+
else:
261+
out_frame_lm_list = orig_frame_lm_list
237262

238-
assert type(lm_list) == list
239-
assert len(lm_list) == MEDIAPIPE_FACE_LANDMARKS_COUNT
263+
assert type(orig_frame_lm_list) == list
264+
assert type(out_frame_lm_list) == list
265+
assert len(orig_frame_lm_list) == MEDIAPIPE_FACE_LANDMARKS_COUNT
266+
assert len(out_frame_lm_list) == MEDIAPIPE_FACE_LANDMARKS_COUNT
240267
assert type(nose_tip) == np.ndarray
241268
assert nose_tip.shape == (3,)
242269
assert type(R) == np.ndarray
@@ -249,7 +276,7 @@ def extract_face_data(frames_in: VideoFrameProducer,
249276
out_scales = np.append(out_scales, [np.float32(scale)], axis=0)
250277

251278
# Append to frames container
252-
out_landmarks_list.append(lm_list)
279+
out_landmarks_list.append(out_frame_lm_list)
253280

254281
#
255282
# Manage composite video output
@@ -261,21 +288,33 @@ def extract_face_data(frames_in: VideoFrameProducer,
261288
# Draw face mesh landmarks on the overlay image.
262289
if landmarks is not None:
263290

264-
# Let's use 1 pixel radius every 500 pixels of video. Can be 0, but it is OK.
265-
norm_landmark_radius = int(width / 500)
291+
# Let's use 1 pixel radius every 600 pixels of video, with a minimum of 1 pixel.
292+
norm_landmark_radius = max(1, int(width / 600))
266293
# Set the thickness as the same as the radius.
267294
norm_landmark_thickness = norm_landmark_radius
268-
# Drawing specifications for MediaPipe
269-
drawing_specs = mp_drawing.DrawingSpec(color=mp_drawing.RED_COLOR,
270-
circle_radius=norm_landmark_radius,
271-
thickness=norm_landmark_thickness)
272-
273-
# print('face_landmarks:', face_landmarks)
274-
mp_drawing.draw_landmarks(
275-
image=annotated_image,
276-
landmark_list=landmarks,
277-
landmark_drawing_spec=drawing_specs
278-
)
295+
296+
#
297+
# Draw the landmarks over the face
298+
for i, lm in enumerate(orig_frame_lm_list):
299+
lm_x, lm_y, lm_z = lm[:]
300+
301+
# If a coordinate is NaN, it's because the face was not found
302+
if math.isnan(lm_x):
303+
continue
304+
305+
# As the landmarks are already normalized in a range [0,1],
306+
# scale them to the full output frame resolution
307+
308+
lm_x *= width
309+
lm_y *= height
310+
311+
if i in VERTICES_TO_HIGHLIGHT:
312+
vcol = (20, 220, 220)
313+
else:
314+
vcol = (20, 20, 220)
315+
316+
cv2.circle(img=annotated_image, center=(int(lm_x), int(lm_y)), radius=norm_landmark_radius,
317+
color=vcol, thickness=norm_landmark_thickness)
279318

280319
#
281320
# DEBUG: save the landmarks to a file
@@ -284,18 +323,17 @@ def extract_face_data(frames_in: VideoFrameProducer,
284323
# pickle.dump(obj=lm_list, file=outfile)
285324

286325
#
287-
# Print landmarks with custom routine
288-
# Fill the upper left quarter of the image using a orthographic projection (i.e., use only x and y)
326+
# Draw the landmarks in the upper left corner of the image using an orthographic projection (i.e., use only x and y)
289327
# and we use the depth to modulate the color intensity.
290328

291329
# First compute the dynamic range of the z coordinate among all points
292-
zs = [p[2] for p in lm_list]
330+
zs = [p[2] for p in out_frame_lm_list]
293331
z_min = min(zs)
294332
z_max = max(zs)
295333
z_range = z_max - z_min
296334

297335
# Draw the landmarks
298-
for i, lm in enumerate(lm_list):
336+
for i, lm in enumerate(out_frame_lm_list):
299337
lm_x, lm_y, lm_z = lm[:]
300338

301339
# If a coordinate is NaN, it's because the face was not found
@@ -304,9 +342,9 @@ def extract_face_data(frames_in: VideoFrameProducer,
304342

305343
# As the landmarks are already normalized in a range [0,1],
306344
# bring them to the half of the output frame resolution
307-
308345
lm_x *= width / 2
309346
lm_y *= height / 2
347+
# rescale z in [0,1]
310348
norm_z = 1 - ((lm_z - z_min) / z_range)
311349

312350
cv2.circle(img=annotated_image, center=(int(lm_x), int(lm_y)), radius=norm_landmark_radius,

slvideotools/test_api.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
TEST_VIDEO_PATH = pkg_resources.resource_filename("slvideotools.data", "testvideo.mp4")
2222

23+
from .extract_face_data import MEDIAPIPE_FACE_LANDMARKS_COUNT
24+
2325

2426
def test_trimming(tmp_path):
2527
#
@@ -147,7 +149,7 @@ def test_face_data_extraction(tmp_path):
147149

148150
assert len(landmarks_data.shape) == 3
149151
assert landmarks_data.shape[0] == n_frames
150-
assert landmarks_data.shape[1] == 468
152+
assert landmarks_data.shape[1] == MEDIAPIPE_FACE_LANDMARKS_COUNT
151153
assert landmarks_data.shape[2] == 3
152154
assert landmarks_data.dtype == np.float32
153155

@@ -181,6 +183,8 @@ def test_motion_energy_computation(tmp_path):
181183
FRAME_START = 10
182184
FRAME_END = 100
183185

186+
assert n_frames > FRAME_END, "Test video must have more frames than FRAME_END"
187+
184188
with create_frame_producer(dir_or_video=TEST_VIDEO_PATH) as frame_prod:
185189

186190
out_video_path = os.path.join(tmp_path, "motion_energy.mp4")

0 commit comments

Comments
 (0)