Commit 110ae09

Hands and Palm detection support (#3092)
1 parent 06755d0 commit 110ae09

File tree: 17 files changed, +766 -20 lines changed
tools/accuracy_checker/openvino/tools/accuracy_checker/adapters/README.md

Lines changed: 27 additions & 1 deletion
@@ -217,7 +217,10 @@ AccuracyChecker supports following set of adapters:
   * `u_output` - U channel output layer.
   * `v_output` - V channel output layer.
   * `target_color` - target color space for super resolution image - `bgr` and `rgb` are supported. (Optional, default `bgr`).
-* `landmarks_regression` - converting output of model for landmarks regression to `FacialLandmarksPrediction`.
+* `landmarks_regression` - converting output of model for landmarks regression to `FacialLandmarksPrediction` or `HandLandmarksPrediction`.
+  * `landmarks_out` - landmarks output layer.
+  * `landmarks_step` - number of coordinates per landmark (optional, default `2`).
+  * `is_hand_landmarks` - allows conversion to `HandLandmarksPrediction` instead of `FacialLandmarksPrediction` (optional, default `False`).
 * `pixel_link_text_detection` - converting output of PixelLink like model for text detection to `TextDetectionPrediction`.
   * `pixel_class_out` - name of layer containing information related to text/no-text classification for each pixel.
   * `pixel_link_out` - name of layer containing information related to linkage between pixels and their neighbors.
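
For illustration (not part of this commit), the new `landmarks_regression` options above could be filled in as follows for a hand landmarks model whose output packs three values per point; the layer name and the three-values-per-point layout are assumptions, only the option names and defaults come from this README:

# Hypothetical option values for the landmarks_regression adapter described above.
hand_landmarks_adapter_options = {
    'landmarks_out': 'landmarks',   # assumed output layer name
    'landmarks_step': 3,            # e.g. a model emitting (x, y, z) per landmark
    'is_hand_landmarks': True,      # emit HandLandmarksPrediction instead of FacialLandmarksPrediction
}
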
@@ -504,3 +507,26 @@ AccuracyChecker supports following set of adapters:
   * `reg_max` - maximal value of integral set (optional, default 7).
   * `strides` - strides of input multi-level feature maps (optional, default [8, 16, 32]).
   * `is_legacy` - using a legacy NanoDet model (optional, default False).
+* `palm_detection` - converting output of palm detection model to `DetectionPrediction` representation.
+  * `scores_out` - name of `scores` model output.
+  * `boxes_out` - name of `boxes` model output.
+  * `num_anchor_layers` - number of layers for anchors calculation (optional, default `4`).
+  * `strides` - strides of input multi-level feature maps (optional, default `[8, 16, 16, 16]`).
+  * `min_scale` - minimal scale for anchors calculation (optional, default `0.1484375`).
+  * `max_scale` - maximal scale for anchors calculation (optional, default `0.75`).
+  * `input_size_width` - width of a model input image (optional, default `128`).
+  * `input_size_height` - height of a model input image (optional, default `128`).
+  * `reduce_boxes_in_lowest_layer` - reduce size of anchors in lowest layer (optional, default `False`).
+  * `aspect_ratios` - aspect ratios for multi-level feature maps (optional, default `[1]`).
+  * `inteprolated_scale_aspect_ratio` - aspect ratio for interpolated scale (optional, default `1`).
+  * `fixed_anchor_size` - produces anchors with fixed size (optional, default `True`).
+  * `sigmoid_score` - score output is sigmoid (optional, default `True`).
+  * `score_clipping_thresh` - score clipping threshold (optional, default `100`).
+  * `reverse_output_order` - `boxes` output data order is (x, y) instead of (y, x) (optional, default `True`).
+  * `keypoint_coord_offset` - offset of keypoint coordinates in `boxes` output (optional, default `4`).
+  * `num_keypoints` - number of keypoints in `boxes` output (optional, default `7`).
+  * `num_values_per_keypoint` - number of coordinates per keypoint (optional, default `2`).
+  * `scales` - detection box scales for x, y, w, h (optional, default `[128, 128, 128, 128]`).
+  * `min_score_thresh` - lower bound for valid boxes scores (optional, default `0.5`).
+  * `apply_exp_on_box_size` - box size is an argument of exponent (optional, default `False`).
+  * `num_classes` - number of detection classes (optional, default `1`).
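
As a rough cross-check of the defaults above (an illustration, not part of the commit): layers that share a stride are grouped onto one grid, and with `reduce_boxes_in_lowest_layer` set to `False` and a positive `inteprolated_scale_aspect_ratio`, every layer contributes `len(aspect_ratios) + 1` anchors per cell. For a 128x128 input with strides `[8, 16, 16, 16]` and `aspect_ratios` `[1]` that gives 16*16*2 + 8*8*6 = 896 anchors, one per row of the `boxes` output (and with `keypoint_coord_offset` 4, 7 keypoints and 2 values per keypoint, each row carries 4 + 7*2 = 18 values). A minimal sketch of that count, assuming the documented defaults:

# Sketch: anchor count implied by the documented palm_detection defaults.
import math
from itertools import groupby

strides, input_size, aspect_ratios = [8, 16, 16, 16], 128, [1]

total = 0
for stride, group in groupby(strides):           # contiguous layers with equal stride share one grid
    layers_with_this_stride = len(list(group))
    cells = math.ceil(input_size / stride) ** 2
    total += cells * layers_with_this_stride * (len(aspect_ratios) + 1)

print(total)  # 896
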

tools/accuracy_checker/openvino/tools/accuracy_checker/adapters/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -130,6 +130,8 @@
 
 from .time_series import QuantilesPredictorAdapter
 
+from .palm_detection import PalmDetectionAdapter
+
 __all__ = [
     'Adapter',
     'AdapterField',
@@ -276,4 +278,6 @@
     'ActionRecognitionWithNoAction',
 
     'ImageBackgroundMattingAdapter',
+
+    'PalmDetectionAdapter'
 ]

tools/accuracy_checker/openvino/tools/accuracy_checker/adapters/attributes_recognition.py

Lines changed: 38 additions & 6 deletions
@@ -17,12 +17,13 @@
 import numpy as np
 
 from ..adapters import Adapter
-from ..config import ConfigValidator, StringField, PathField
+from ..config import ConfigValidator, StringField, PathField, NumberField, BoolField
 from ..representation import (
     ContainerPrediction,
     RegressionPrediction,
     ClassificationPrediction,
     FacialLandmarksPrediction,
+    HandLandmarksPrediction,
     MultiLabelRecognitionPrediction,
     GazeVectorPrediction,
     FacialLandmarks3DPrediction
@@ -269,18 +270,49 @@ def select_output_blob(self, outputs):
 
 class LandmarksRegressionAdapter(Adapter):
     __provider__ = 'landmarks_regression'
-    prediction_types = (FacialLandmarksPrediction, )
+    prediction_types = (FacialLandmarksPrediction, HandLandmarksPrediction)
+
+    @classmethod
+    def parameters(cls):
+        parameters = super().parameters()
+        parameters.update({
+            'landmarks_out': StringField(description="Output layer name for landmarks recognition.", optional=True),
+            'landmarks_step': NumberField(description='Number of data per landmark point', optional=True, default=2,
+                                          value_type=int),
+            'is_hand_landmarks': BoolField(description="Model predicts hand landmarks", optional=True,
+                                           default=False),
+        })
+        return parameters
+
+    def configure(self):
+        self.landmarks_out = self.get_value_from_config('landmarks_out')
+        self.landmarks_step = self.get_value_from_config('landmarks_step')
+        self.is_hand_landmarks = self.get_value_from_config('is_hand_landmarks')
+        self.output_verified = False
 
     def process(self, raw, identifiers=None, frame_meta=None):
         res = []
         raw_output = self._extract_predictions(raw, frame_meta)
-        self.select_output_blob(raw_output)
-        for identifier, values in zip(identifiers, raw_output[self.output_blob]):
-            x_values, y_values = values[::2], values[1::2]
-            res.append(FacialLandmarksPrediction(identifier, x_values.reshape(-1), y_values.reshape(-1)))
+        if not self.output_verified:
+            self.select_output_blob(raw_output)
+        prediction = raw_output[self.landmarks_out]
+        for identifier, values in zip(identifiers, prediction):
+            x_values, y_values = values[::self.landmarks_step], values[1::self.landmarks_step]
+            if self.is_hand_landmarks:
+                res.append(HandLandmarksPrediction(identifier, x_values.reshape(-1), y_values.reshape(-1)))
+            else:
+                res.append(FacialLandmarksPrediction(identifier, x_values.reshape(-1), y_values.reshape(-1)))
 
         return res
 
+    def select_output_blob(self, outputs):
+        self.output_verified = True
+        if self.landmarks_out:
+            self.landmarks_out = self.check_output_name(self.landmarks_out, outputs)
+            return
+
+        super().select_output_blob(outputs)
+        self.landmarks_out = self.output_blob
 
 class PersonAttributesAdapter(Adapter):
     __provider__ = 'person_attributes'
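
To make the new slicing in `LandmarksRegressionAdapter.process` concrete, a small standalone illustration (the 21-point, three-values-per-point layout is an assumption about a typical hand landmarks model, not something this adapter requires):

# Standalone sketch of the landmarks_step handling above: with landmarks_step=3
# a flat (x, y, z, x, y, z, ...) vector is split so only x and y are kept.
import numpy as np

landmarks_step = 3
values = np.arange(21 * landmarks_step, dtype=np.float32)  # stand-in for one model output row

x_values = values[::landmarks_step]   # elements 0, 3, 6, ...
y_values = values[1::landmarks_step]  # elements 1, 4, 7, ...
assert x_values.shape == y_values.shape == (21,)
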
tools/accuracy_checker/openvino/tools/accuracy_checker/adapters/palm_detection.py

Lines changed: 242 additions & 0 deletions
@@ -0,0 +1,242 @@
"""
Copyright (c) 2018-2022 Intel Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import numpy as np

from ..adapters import Adapter
from ..config import StringField, NumberField, BoolField, ListField
from ..representation import DetectionPrediction


class PalmDetectionAdapter(Adapter):
    __provider__ = 'palm_detection'

    @classmethod
    def parameters(cls):
        params = super().parameters()
        params.update({
            'scores_out': StringField(description='scores output'),
            'boxes_out': StringField(description='boxes output'),
            'num_anchor_layers': NumberField(
                description="Number of anchor layers", value_type=int, min_value=0, default=4, optional=True),
            'strides': ListField(value_type=int, optional=True, default=[8, 16, 16, 16],
                                 description='strides of input multi-level feature maps'),
            'min_scale': NumberField(description="Minimal scale", default=0.1484375, optional=True),
            'max_scale': NumberField(description="Maximal scale", default=0.75, optional=True),
            'input_size_width': NumberField(
                description="Width of a model input image.", value_type=int, min_value=128, default=128,
                optional=True),
            'input_size_height': NumberField(
                description="Height of a model input image.", value_type=int, min_value=128, default=128,
                optional=True),
            'reduce_boxes_in_lowest_layer': BoolField(
                description="Reduce size of anchors in lowest layer", default=False, optional=True),
            'aspect_ratios': ListField(value_type=int, optional=True, default=[1],
                                       description='Aspect ratios for each level of input multi-level feature maps'),
            'inteprolated_scale_aspect_ratio': NumberField(
                description="Aspect ratio for interpolated scale", default=1, optional=True),
            'fixed_anchor_size': BoolField(
                description="Produces anchors with fixed size", default=True, optional=True),
            'sigmoid_score': BoolField(description="Score output is sigmoid", default=True, optional=True),
            'score_clipping_thresh': NumberField(
                description="Score clipping threshold", default=100, optional=True),
            'reverse_output_order': BoolField(
                description="(x,y) coordinates order instead of (y,x)", default=True, optional=True),
            'keypoint_coord_offset': NumberField(
                description="Offset of keypoints coordinates", value_type=int, min_value=4, default=4, optional=True),
            'num_keypoints': NumberField(
                description="Number of keypoints", value_type=int, min_value=0, default=7, optional=True),
            'num_values_per_keypoint': NumberField(
                description="Number of coordinates per keypoint",
                value_type=int, min_value=0, default=2, optional=True),
            'scales': ListField(
                description='Detection box scales for x,y,w,h.', value_type=int, optional=True,
                default=[128, 128, 128, 128]),
            'min_score_thresh': NumberField(description="Minimal score threshold", default=0.5, optional=True),
            'apply_exp_on_box_size': BoolField(
                description="Box size is an argument of exponent", default=False, optional=True),
            'num_classes': NumberField(
                description="Number of classes.", value_type=int, min_value=0, default=1, optional=True),
        })
        return params

    def configure(self):
        self.scores_out = self.get_value_from_config('scores_out')
        self.boxes_out = self.get_value_from_config('boxes_out')
        self.outputs_verified = False

        self.num_anchor_layers = self.get_value_from_config('num_anchor_layers')
        self.min_scale = self.get_value_from_config('min_scale')
        self.max_scale = self.get_value_from_config('max_scale')
        self.input_size_height = self.get_value_from_config('input_size_height')
        self.input_size_width = self.get_value_from_config('input_size_width')
        self.strides = self.get_value_from_config('strides')
        self.reduce_boxes_in_lowest_layer = self.get_value_from_config('reduce_boxes_in_lowest_layer')
        self.inteprolated_scale_aspect_ratio = self.get_value_from_config('inteprolated_scale_aspect_ratio')
        self.fixed_anchor_size = self.get_value_from_config('fixed_anchor_size')
        self.aspect_ratios = self.get_value_from_config('aspect_ratios')
        self.anchor_offset_x = 0.5
        self.anchor_offset_y = 0.5
        self.feature_map_height = []
        self.feature_map_width = []
        self.anchors = self.generate_anchors()

        self.sigmoid_score = self.get_value_from_config('sigmoid_score')
        self.score_clipping_thresh = self.get_value_from_config('score_clipping_thresh')
        self.reverse_output_order = self.get_value_from_config('reverse_output_order')
        self.keypoint_coord_offset = self.get_value_from_config('keypoint_coord_offset')
        self.num_keypoints = self.get_value_from_config('num_keypoints')
        self.num_values_per_keypoint = self.get_value_from_config('num_values_per_keypoint')
        scales = self.get_value_from_config('scales')
        assert len(scales) == 4
        self.x_scale, self.y_scale, self.w_scale, self.h_scale = scales
        self.min_score_thresh = self.get_value_from_config('min_score_thresh')
        self.apply_exp_on_box_size = self.get_value_from_config('apply_exp_on_box_size')
        self.num_classes = self.get_value_from_config('num_classes')

    def select_output_blob(self, outputs):
        self.scores_out = self.check_output_name(self.scores_out, outputs)
        self.boxes_out = self.check_output_name(self.boxes_out, outputs)
        self.outputs_verified = True

    def process(self, raw, identifiers, frame_meta):
        result = []
        raw_output = self._extract_predictions(raw, frame_meta)
        if not self.outputs_verified:
            self.select_output_blob(raw_output)

        for identifier, raw_scores, raw_boxes in zip(identifiers, raw_output[self.scores_out],
                                                     raw_output[self.boxes_out]):
            num_boxes, _ = raw_boxes.shape
            boxes = self.decode_boxes(raw_boxes)
            detection_scores = np.zeros(num_boxes)
            detection_classes = np.zeros(num_boxes)

            for i in range(num_boxes):
                class_id = -1
                max_score = -np.inf
                for score_idx in range(self.num_classes):
                    score = raw_scores[i, score_idx]
                    if self.sigmoid_score:
                        if self.score_clipping_thresh:
                            score = np.clip(score, (-1) * self.score_clipping_thresh, self.score_clipping_thresh)
                        score = 1 / (1 + np.exp((-1) * score))
                    if max_score < score:
                        max_score = score
                        class_id = score_idx
                detection_classes[i] = class_id
                detection_scores[i] = max_score
            cond = detection_scores >= self.min_score_thresh
            boxes = np.array(boxes)[cond]
            detection_classes = detection_classes[cond]
            detection_scores = detection_scores[cond]

            cond = ((boxes[:, 2] - boxes[:, 0]) >= 0) & ((boxes[:, 3] - boxes[:, 1]) >= 0)

            boxes = boxes[cond, :]
            detection_classes = detection_classes[cond]
            detection_scores = detection_scores[cond]

            y_mins, x_mins, y_maxs, x_maxs = boxes.T[:4, :]

            result.append(DetectionPrediction(identifier, detection_classes, detection_scores,
                                              x_mins, y_mins, x_maxs, y_maxs))

        return result

    @staticmethod
    def calculate_scale(min_scale, max_scale, stride_index, num_strides):
        if num_strides == 1:
            return (min_scale + max_scale) * 0.5
        return min_scale + (max_scale - min_scale) * stride_index / (num_strides - 1)

    def generate_anchors(self):
        anchors = []
        layer_id = 0
        while layer_id < self.num_anchor_layers:
            anchor_height = []
            anchor_width = []
            aspect_ratios = []
            scales = []

            last_same_stride_layer = layer_id
            while last_same_stride_layer < len(self.strides) and (self.strides[last_same_stride_layer] ==
                                                                  self.strides[layer_id]):
                scale = self.calculate_scale(self.min_scale, self.max_scale, last_same_stride_layer, len(self.strides))
                if last_same_stride_layer == 0 and self.reduce_boxes_in_lowest_layer:
                    ar_and_s = zip([1, 2, 0.5], [0.1, scale, scale])
                else:
                    ar_and_s = zip(self.aspect_ratios, [scale] * len(self.aspect_ratios))
                for aspect_ratio, scale_ in ar_and_s:
                    aspect_ratios.append(aspect_ratio)
                    scales.append(scale_)

                if self.inteprolated_scale_aspect_ratio > 0:
                    scale_next = 1 if last_same_stride_layer == len(self.strides) - 1 else self.calculate_scale(
                        self.min_scale, self.max_scale, last_same_stride_layer + 1, len(self.strides))
                    scales.append(np.sqrt(scale * scale_next))
                    aspect_ratios.append(self.inteprolated_scale_aspect_ratio)
                last_same_stride_layer += 1

            for aspect_ratio, scale in zip(aspect_ratios, scales):
                anchor_height.append(scale / np.sqrt(aspect_ratio))
                anchor_width.append(scale * np.sqrt(aspect_ratio))

            feature_map_height = self.feature_map_height[layer_id] if self.feature_map_height else int(
                np.ceil(self.input_size_height / self.strides[layer_id]))
            feature_map_width = self.feature_map_width[layer_id] if self.feature_map_width else int(
                np.ceil(self.input_size_width / self.strides[layer_id]))

            for y in range(feature_map_height):
                for x in range(feature_map_width):
                    for anchor_w, anchor_h in zip(anchor_width, anchor_height):
                        anchor = [(x + self.anchor_offset_x) / feature_map_width,
                                  (y + self.anchor_offset_y) / feature_map_height,
                                  1 if self.fixed_anchor_size else anchor_w,
                                  1 if self.fixed_anchor_size else anchor_h]

                        anchors.append(anchor)

            layer_id = last_same_stride_layer

        return np.array(anchors)

    def decode_boxes(self, raw_boxes):
        boxes = []
        num_boxes, _ = raw_boxes.shape

        for i in range(num_boxes):
            anchor = self.anchors[i, :]
            y_center = raw_boxes[i, 1] if self.reverse_output_order else raw_boxes[i, 0]
            x_center = raw_boxes[i, 0] if self.reverse_output_order else raw_boxes[i, 1]
            h = raw_boxes[i, 3] if self.reverse_output_order else raw_boxes[i, 2]
            w = raw_boxes[i, 2] if self.reverse_output_order else raw_boxes[i, 3]

            x_center = x_center / self.x_scale * anchor[2] + anchor[0]
            y_center = y_center / self.y_scale * anchor[3] + anchor[1]
            h = np.exp(h / self.h_scale) * anchor[3] if self.apply_exp_on_box_size else h / self.h_scale * anchor[3]
            w = np.exp(w / self.w_scale) * anchor[2] if self.apply_exp_on_box_size else w / self.w_scale * anchor[2]

            decoded = [y_center - h / 2, x_center - w / 2, y_center + h / 2, x_center + w / 2]

            for k in range(self.num_keypoints):
                offset = self.keypoint_coord_offset + k * self.num_values_per_keypoint
                keypoint_y = raw_boxes[i, offset + 1] if self.reverse_output_order else raw_boxes[i, offset]
                keypoint_x = raw_boxes[i, offset] if self.reverse_output_order else raw_boxes[i, offset + 1]

                decoded.append(keypoint_x / self.x_scale * anchor[2] + anchor[0])
                decoded.append(keypoint_y / self.y_scale * anchor[3] + anchor[1])

            boxes.append(decoded)

        return boxes
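
A compact sketch of what `decode_boxes` does to a single row, assuming the defaults documented in the README (`reverse_output_order=True`, `fixed_anchor_size=True`, `apply_exp_on_box_size=False`, all scales equal to 128); the input numbers are made up purely to trace the arithmetic:

# Standalone sketch (not the adapter itself) of per-box decoding under the defaults.
import numpy as np

def decode_one(raw, anchor, scale=128.0):
    # raw: [x, y, w, h, kp0_x, kp0_y, ...] in units of the 128x128 input,
    # anchor: [x_center, y_center, 1, 1] as produced with fixed_anchor_size=True
    x_center = raw[0] / scale * anchor[2] + anchor[0]
    y_center = raw[1] / scale * anchor[3] + anchor[1]
    w = raw[2] / scale * anchor[2]
    h = raw[3] / scale * anchor[3]
    box = [y_center - h / 2, x_center - w / 2, y_center + h / 2, x_center + w / 2]
    keypoints = [(raw[4 + 2 * k] / scale * anchor[2] + anchor[0],
                  raw[5 + 2 * k] / scale * anchor[3] + anchor[1])
                 for k in range((len(raw) - 4) // 2)]
    return box, keypoints

# an anchor centred at (0.5, 0.5) and a raw box offset by (+16, -8) px with a 64 px side
box, kps = decode_one(np.array([16.0, -8.0, 64.0, 64.0, 0.0, 0.0]), np.array([0.5, 0.5, 1.0, 1.0]))
print(box)  # [0.1875, 0.375, 0.6875, 0.875] -> (ymin, xmin, ymax, xmax) relative to the input size
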
