diff --git a/src/cpp/models/include/models/keypoint_detection.h b/src/cpp/models/include/models/keypoint_detection.h index 6e95731a..63e4e50f 100644 --- a/src/cpp/models/include/models/keypoint_detection.h +++ b/src/cpp/models/include/models/keypoint_detection.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * SPDX-License-Identifier: Apache-2.0 */ @@ -38,6 +38,8 @@ class KeypointDetectionModel : public ImageModel { static std::string ModelType; protected: + bool apply_softmax = true; + void prepareInputsOutputs(std::shared_ptr& model) override; void updateModelInfo() override; void init_from_config(const ov::AnyMap& top_priority, const ov::AnyMap& mid_priority); diff --git a/src/cpp/models/include/models/results.h b/src/cpp/models/include/models/results.h index 9468043c..710d6ab8 100644 --- a/src/cpp/models/include/models/results.h +++ b/src/cpp/models/include/models/results.h @@ -6,6 +6,7 @@ #pragma once #include #include +#include #include #include #include @@ -356,8 +357,11 @@ struct DetectedKeypoints { for (const cv::Point2f& keypoint : prediction.keypoints) { kp_x_sum += keypoint.x; } + float scores_sum = std::accumulate(prediction.scores.begin(), prediction.scores.end(), 0.f); + os << "keypoints: (" << prediction.keypoints.size() << ", 2), keypoints_x_sum: "; - os << std::fixed << std::setprecision(3) << kp_x_sum << ", scores: (" << prediction.scores.size() << ",)"; + os << std::fixed << std::setprecision(3) << kp_x_sum << ", scores: (" << prediction.scores.size() << ",) " + << std::fixed << std::setprecision(3) << scores_sum; return os; } diff --git a/src/cpp/models/src/keypoint_detection.cpp b/src/cpp/models/src/keypoint_detection.cpp index f1ba35ed..a1c78112 100644 --- a/src/cpp/models/src/keypoint_detection.cpp +++ b/src/cpp/models/src/keypoint_detection.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * 
SPDX-License-Identifier: Apache-2.0 */ @@ -18,33 +18,48 @@ namespace { -void colArgMax(const cv::Mat& src, cv::Mat& dst_locs, cv::Mat& dst_values) { +void colArgMax(const cv::Mat& src, + cv::Mat& dst_locs, + cv::Mat& dst_values, + bool apply_softmax = false, + float eps = 1e-6f) { dst_locs = cv::Mat::zeros(src.rows, 1, CV_32S); dst_values = cv::Mat::zeros(src.rows, 1, CV_32F); - for (int row = 0; row < src.rows; row++) { + for (int row = 0; row < src.rows; ++row) { const float* ptr_row = src.ptr(row); int max_val_idx = 0; - dst_values.at(row) = ptr_row[max_val_idx]; + float max_val = ptr_row[0]; for (int col = 1; col < src.cols; ++col) { - if (ptr_row[col] > ptr_row[max_val_idx]) { + if (ptr_row[col] > max_val) { max_val_idx = col; dst_locs.at(row) = max_val_idx; - dst_values.at(row) = ptr_row[col]; + max_val = ptr_row[col]; } } + + if (apply_softmax) { + float sum = 0.0f; + for (int col = 0; col < src.cols; ++col) { + sum += exp(ptr_row[col] - max_val); + } + dst_values.at(row) = exp(ptr_row[max_val_idx] - max_val) / (sum + eps); + } else { + dst_values.at(row) = max_val; + } } } DetectedKeypoints decode_simcc(const cv::Mat& simcc_x, const cv::Mat& simcc_y, const cv::Point2f& extra_scale = cv::Point2f(1.f, 1.f), + bool apply_softmax = false, float simcc_split_ratio = 2.0f) { cv::Mat x_locs, max_val_x; - colArgMax(simcc_x, x_locs, max_val_x); + colArgMax(simcc_x, x_locs, max_val_x, apply_softmax); cv::Mat y_locs, max_val_y; - colArgMax(simcc_y, y_locs, max_val_y); + colArgMax(simcc_y, y_locs, max_val_y, apply_softmax); std::vector keypoints(x_locs.rows); cv::Mat scores = cv::Mat::zeros(x_locs.rows, 1, CV_32F); @@ -67,6 +82,7 @@ std::string KeypointDetectionModel::ModelType = "keypoint_detection"; void KeypointDetectionModel::init_from_config(const ov::AnyMap& top_priority, const ov::AnyMap& mid_priority) { labels = get_from_any_maps("labels", top_priority, mid_priority, labels); + apply_softmax = get_from_any_maps("apply_softmax", top_priority, mid_priority, 
apply_softmax); } KeypointDetectionModel::KeypointDetectionModel(std::shared_ptr& model, const ov::AnyMap& configuration) @@ -204,7 +220,8 @@ std::unique_ptr KeypointDetectionModel::postprocess(InferenceResult& float inverted_scale_x = static_cast(image_data.inputImgWidth) / netInputWidth, inverted_scale_y = static_cast(image_data.inputImgHeight) / netInputHeight; - result->poses.emplace_back(decode_simcc(pred_x_mat, pred_y_mat, {inverted_scale_x, inverted_scale_y})); + result->poses.emplace_back( + decode_simcc(pred_x_mat, pred_y_mat, {inverted_scale_x, inverted_scale_y}, apply_softmax)); return std::unique_ptr(result); } diff --git a/src/python/model_api/models/keypoint_detection.py b/src/python/model_api/models/keypoint_detection.py index 5ea9ef1d..9e9b2fb5 100644 --- a/src/python/model_api/models/keypoint_detection.py +++ b/src/python/model_api/models/keypoint_detection.py @@ -1,5 +1,5 @@ # -# Copyright (C) 2020-2024 Intel Corporation +# Copyright (C) 2020-2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # @@ -11,7 +11,7 @@ from .image_model import ImageModel from .result import DetectedKeypoints, DetectionResult -from .types import ListValue +from .types import BooleanValue, ListValue class KeypointDetectionModel(ImageModel): @@ -30,6 +30,7 @@ def __init__(self, inference_adapter, configuration: dict = {}, preload=False): """ super().__init__(inference_adapter, configuration, preload) self._check_io_number(1, 2) + self.apply_softmax: bool def postprocess( self, @@ -46,7 +47,11 @@ def postprocess( DetectedKeypoints: detected keypoints """ encoded_kps = list(outputs.values()) - batch_keypoints, batch_scores = _decode_simcc(*encoded_kps) + batch_keypoints, batch_scores = _decode_simcc( + encoded_kps[0], + encoded_kps[1], + apply_softmax=self.apply_softmax, + ) orig_h, orig_w = meta["original_shape"][:2] kp_scale_h = orig_h / self.h kp_scale_w = orig_w / self.w @@ -63,6 +68,10 @@ def parameters(cls) -> dict: value_type=str, default_value=[], ), + 
"apply_softmax": BooleanValue( + default_value=True, + description="Whether to apply softmax on the heatmap.", + ), }, ) return parameters @@ -119,6 +128,7 @@ def _decode_simcc( simcc_x: np.ndarray, simcc_y: np.ndarray, simcc_split_ratio: float = 2.0, + apply_softmax: bool = False, ) -> tuple[np.ndarray, np.ndarray]: """Decodes keypoint coordinates from SimCC representations. The decoded coordinates are in the input image space. @@ -126,6 +136,8 @@ def _decode_simcc( simcc_x (np.ndarray): SimCC label for x-axis simcc_y (np.ndarray): SimCC label for y-axis simcc_split_ratio (float): The ratio of the label size to the input size. + apply_softmax (bool): whether to apply softmax on the heatmap. + Defaults to False. Returns: tuple: @@ -133,7 +145,7 @@ def _decode_simcc( - scores (np.ndarray): The keypoint scores in shape (N, K). It usually represents the confidence of the keypoint prediction """ - keypoints, scores = _get_simcc_maximum(simcc_x, simcc_y) + keypoints, scores = _get_simcc_maximum(simcc_x, simcc_y, apply_softmax) # Unsqueeze the instance dimension for single-instance results if keypoints.ndim == 2: @@ -148,6 +160,8 @@ def _decode_simcc( def _get_simcc_maximum( simcc_x: np.ndarray, simcc_y: np.ndarray, + apply_softmax: bool = False, + softmax_eps: float = 1e-06, ) -> tuple[np.ndarray, np.ndarray]: """Get maximum response location and value from simcc representations. @@ -160,6 +174,10 @@ def _get_simcc_maximum( Args: simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx) simcc_y (np.ndarray): y-axis SimCC in shape (K, Hy) or (N, K, Hy) + apply_softmax (bool): whether to apply softmax on the heatmap. + Defaults to False. + softmax_eps (float): a constant to avoid division by zero in softmax. + Defaults to 1e-6.
Returns: tuple: @@ -185,6 +203,13 @@ def _get_simcc_maximum( else: batch_size = None + if apply_softmax: + simcc_x = simcc_x - np.max(simcc_x, axis=1, keepdims=True) + simcc_y = simcc_y - np.max(simcc_y, axis=1, keepdims=True) + ex, ey = np.exp(simcc_x), np.exp(simcc_y) + simcc_x = ex / (np.sum(ex, axis=1, keepdims=True) + softmax_eps) + simcc_y = ey / (np.sum(ey, axis=1, keepdims=True) + softmax_eps) + x_locs = np.argmax(simcc_x, axis=1) y_locs = np.argmax(simcc_y, axis=1) locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32) diff --git a/src/python/model_api/models/result/keypoint.py b/src/python/model_api/models/result/keypoint.py index 8c79c585..09d31068 100644 --- a/src/python/model_api/models/result/keypoint.py +++ b/src/python/model_api/models/result/keypoint.py @@ -17,5 +17,5 @@ def __str__(self): return ( f"keypoints: {self.keypoints.shape}, " f"keypoints_x_sum: {np.sum(self.keypoints[:, :1]):.3f}, " - f"scores: {self.scores.shape}" + f"scores: {self.scores.shape} {np.sum(self.scores):.3f}" ) diff --git a/tests/python/accuracy/public_scope.json b/tests/python/accuracy/public_scope.json index 361781a0..8dad6ec8 100644 --- a/tests/python/accuracy/public_scope.json +++ b/tests/python/accuracy/public_scope.json @@ -425,7 +425,7 @@ { "image": "coco128/images/train2017/000000000471.jpg", "reference": [ - "keypoints: (17, 2), keypoints_x_sum: 5700.000, scores: (17,)" + "keypoints: (17, 2), keypoints_x_sum: 5700.000, scores: (17,) 0.049" ] } ]