
Commit 7043036

Add keypoint detection
1 parent 58a8d0a commit 7043036

5 files changed: +347 additions, -0 deletions

src/cpp/include/tasks/keypoint_detection.h

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
/*
 * Copyright (C) 2024-2025 Intel Corporation
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once
#include <opencv2/opencv.hpp>
#include <openvino/openvino.hpp>

#include "adapters/inference_adapter.h"
#include "tasks/results.h"
#include "utils/config.h"
#include "utils/preprocessing.h"
#include "utils/vision_pipeline.h"

class KeypointDetection {
public:
    VisionPipeline<KeypointDetectionResult> pipeline;
    std::shared_ptr<InferenceAdapter> adapter;
    KeypointDetection(std::shared_ptr<InferenceAdapter> adapter, const ov::AnyMap& user_config) : adapter(adapter) {
        pipeline = VisionPipeline<KeypointDetectionResult>(
            adapter,
            [&](cv::Mat image) {
                return preprocess(image);
            },
            [&](InferenceResult result) {
                return postprocess(result);
            });

        auto model_config = adapter->getModelConfig();
        labels = utils::get_from_any_maps("labels", user_config, model_config, labels);
        apply_softmax = utils::get_from_any_maps("apply_softmax", user_config, model_config, apply_softmax);

        input_shape.width = utils::get_from_any_maps("orig_width", user_config, model_config, input_shape.width);
        input_shape.height = utils::get_from_any_maps("orig_height", user_config, model_config, input_shape.height);
        resize_mode = utils::get_from_any_maps("resize_type", user_config, model_config, resize_mode);
    }

    static void serialize(std::shared_ptr<ov::Model>& ov_model);
    static KeypointDetection create_model(const std::string& model_path,
                                          const ov::AnyMap& user_config = {},
                                          bool preload = true,
                                          const std::string& device = "AUTO");

    std::map<std::string, ov::Tensor> preprocess(cv::Mat image);
    KeypointDetectionResult postprocess(InferenceResult& infResult);

    KeypointDetectionResult infer(cv::Mat image);
    std::vector<KeypointDetectionResult> inferBatch(std::vector<cv::Mat> images);

private:
    cv::Size input_shape;
    bool apply_softmax = true;
    utils::RESIZE_MODE resize_mode = utils::RESIZE_MODE::RESIZE_FILL;
    std::vector<std::string> labels;
};
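
For orientation (not part of the commit): a minimal usage sketch of the API declared above; the model and image paths below are placeholders, not files from this repository.

#include <iostream>
#include <opencv2/opencv.hpp>

#include "tasks/keypoint_detection.h"

int main() {
    // Hypothetical paths, shown only to illustrate the new wrapper.
    auto model = KeypointDetection::create_model("rtmpose_tiny.xml");
    cv::Mat image = cv::imread("person.jpg");

    KeypointDetectionResult result = model.infer(image);
    // operator<< prints a summary, e.g. "keypoints: (17, 2), keypoints_x_sum: ..., scores: (17,) ..."
    std::cout << result << std::endl;

    // Batched inference goes through the same pipeline.
    std::vector<KeypointDetectionResult> batch = model.inferBatch({image, image});
    std::cout << batch.size() << " results" << std::endl;
    return 0;
}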

src/cpp/include/tasks/results.h

Lines changed: 24 additions & 0 deletions
@@ -227,3 +227,27 @@ struct AnomalyResult {
        return ss.str();
    }
};

struct KeypointDetectionResult {
    std::vector<cv::Point2f> keypoints;
    std::vector<float> scores;

    friend std::ostream& operator<<(std::ostream& os, const KeypointDetectionResult& prediction) {
        float kp_x_sum = 0.f;
        for (const cv::Point2f& keypoint : prediction.keypoints) {
            kp_x_sum += keypoint.x;
        }
        float scores_sum = std::accumulate(prediction.scores.begin(), prediction.scores.end(), 0.f);

        os << "keypoints: (" << prediction.keypoints.size() << ", 2), keypoints_x_sum: ";
        os << std::fixed << std::setprecision(3) << kp_x_sum << ", scores: (" << prediction.scores.size() << ",) "
           << std::fixed << std::setprecision(3) << scores_sum;
        return os;
    }

    explicit operator std::string() {
        std::stringstream ss;
        ss << *this;
        return ss.str();
    }
};
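
For reference (not part of the commit): the summary emitted by operator<< above is the string the accuracy tests and the public_scope.json reference entries compare against. A small self-contained sketch with made-up values:

#include <iostream>

#include "tasks/results.h"

int main() {
    KeypointDetectionResult r;
    r.keypoints = {{100.f, 50.f}, {200.5f, 75.f}};  // two made-up keypoints
    r.scores = {0.9f, 0.8f};

    // Prints: keypoints: (2, 2), keypoints_x_sum: 300.500, scores: (2,) 1.700
    std::cout << r << std::endl;
    return 0;
}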
Lines changed: 223 additions & 0 deletions
@@ -0,0 +1,223 @@
/*
 * Copyright (C) 2024-2025 Intel Corporation
 * SPDX-License-Identifier: Apache-2.0
 */

#include "tasks/keypoint_detection.h"

#include "adapters/openvino_adapter.h"
#include "utils/config.h"
#include "utils/tensor.h"

namespace {

void colArgMax(const cv::Mat& src,
               cv::Mat& dst_locs,
               cv::Mat& dst_values,
               bool apply_softmax = false,
               float eps = 1e-6f) {
    dst_locs = cv::Mat::zeros(src.rows, 1, CV_32S);
    dst_values = cv::Mat::zeros(src.rows, 1, CV_32F);

    for (int row = 0; row < src.rows; ++row) {
        const float* ptr_row = src.ptr<float>(row);
        int max_val_idx = 0;
        float max_val = ptr_row[0];
        for (int col = 1; col < src.cols; ++col) {
            if (ptr_row[col] > max_val) {
                max_val_idx = col;
                dst_locs.at<int>(row) = max_val_idx;
                max_val = ptr_row[col];
            }
        }

        if (apply_softmax) {
            float sum = 0.0f;
            for (int col = 0; col < src.cols; ++col) {
                sum += exp(ptr_row[col] - max_val);
            }
            dst_values.at<float>(row) = exp(ptr_row[max_val_idx] - max_val) / (sum + eps);
        } else {
            dst_values.at<float>(row) = max_val;
        }
    }
}

KeypointDetectionResult decode_simcc(const cv::Mat& simcc_x,
                                     const cv::Mat& simcc_y,
                                     const cv::Point2f& extra_scale = cv::Point2f(1.f, 1.f),
                                     const cv::Point2i& extra_offset = cv::Point2i(0, 0),
                                     bool apply_softmax = false,
                                     float simcc_split_ratio = 2.0f,
                                     float decode_beta = 150.0f,
                                     float sigma = 6.0f) {
    cv::Mat x_locs, max_val_x;
    colArgMax(simcc_x, x_locs, max_val_x, false);

    cv::Mat y_locs, max_val_y;
    colArgMax(simcc_y, y_locs, max_val_y, false);

    if (apply_softmax) {
        cv::Mat tmp_locs;
        colArgMax(decode_beta * sigma * simcc_x, tmp_locs, max_val_x, true);
        colArgMax(decode_beta * sigma * simcc_y, tmp_locs, max_val_y, true);
    }

    std::vector<cv::Point2f> keypoints(x_locs.rows);
    cv::Mat scores = cv::Mat::zeros(x_locs.rows, 1, CV_32F);
    for (int i = 0; i < x_locs.rows; ++i) {
        keypoints[i] = cv::Point2f((x_locs.at<int>(i) - extra_offset.x) * extra_scale.x,
                                   (y_locs.at<int>(i) - extra_offset.y) * extra_scale.y) /
                       simcc_split_ratio;
        scores.at<float>(i) = std::min(max_val_x.at<float>(i), max_val_y.at<float>(i));

        if (scores.at<float>(i) <= 0.f) {
            keypoints[i] = cv::Point2f(-1.f, -1.f);
        }
    }

    return {std::move(keypoints), scores};
}

} // namespace

KeypointDetection KeypointDetection::create_model(const std::string& model_path,
                                                  const ov::AnyMap& user_config,
                                                  bool preload,
                                                  const std::string& device) {
    auto adapter = std::make_shared<OpenVINOInferenceAdapter>();
    adapter->loadModel(model_path, device, user_config, false);

    std::string model_type;
    model_type = utils::get_from_any_maps("model_type", user_config, adapter->getModelConfig(), model_type);

    if (model_type.empty() || model_type != "keypoint_detection") {
        throw std::runtime_error("Incorrect or unsupported model_type, expected: keypoint_detection");
    }
    adapter->applyModelTransform(KeypointDetection::serialize);
    if (preload) {
        adapter->compileModel(device, user_config);
    }

    return KeypointDetection(adapter, user_config);
}

void KeypointDetection::serialize(std::shared_ptr<ov::Model>& ov_model) {
    if (utils::model_has_embedded_processing(ov_model)) {
std::cout << "model already was serialized" << std::endl;
        return;
    }
    if (ov_model->inputs().size() != 1) {
        throw std::logic_error("KeypointDetection model wrapper supports topologies with only 1 input");
    }
    const auto& input = ov_model->input();
    auto config = ov_model->has_rt_info("model_info") ? ov_model->get_rt_info<ov::AnyMap>("model_info") : ov::AnyMap{};
    std::string layout = "";
    layout = utils::get_from_any_maps("layout", config, {}, layout);
    auto inputsLayouts = utils::parseLayoutString(layout);
    const ov::Layout& inputLayout = utils::getInputLayout(input, inputsLayouts);
    const ov::Shape& inputShape = input.get_partial_shape().get_max_shape();
    if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
        throw std::logic_error("3-channel 4-dimensional model's input is expected");
    }

    auto interpolation_mode = cv::INTER_LINEAR;
    utils::RESIZE_MODE resize_mode = utils::RESIZE_MODE::RESIZE_FILL;
    resize_mode = utils::get_from_any_maps("resize_type", config, ov::AnyMap{}, resize_mode);

    std::vector<float> scale_values;
    std::vector<float> mean_values;
    scale_values = utils::get_from_any_maps("scale_values", config, ov::AnyMap{}, scale_values);
    mean_values = utils::get_from_any_maps("mean_values", config, ov::AnyMap{}, mean_values);
    uint8_t pad_value = 0;
    pad_value = utils::get_from_any_maps<unsigned>("pad_value", config, ov::AnyMap{}, pad_value);
    bool reverse_input_channels = false;
    reverse_input_channels =
        utils::get_from_any_maps("reverse_input_channels", config, ov::AnyMap{}, reverse_input_channels);

    cv::Size input_shape(inputShape[ov::layout::width_idx(inputLayout)],
                         inputShape[ov::layout::height_idx(inputLayout)]);

    ov_model = utils::embedProcessing(
        ov_model,
        input.get_any_name(),
        inputLayout,
        resize_mode,
        interpolation_mode,
        ov::Shape{static_cast<size_t>(input_shape.width), static_cast<size_t>(input_shape.height)},
        pad_value,
        reverse_input_channels,
        mean_values,
        scale_values);

    // --------------------------- Check output -----------------------------------------------------

    if (ov_model->outputs().size() != 2) {
        throw std::logic_error(std::string{"KeypointDetection model wrapper supports topologies with 2 outputs"});
    }

    ov_model->set_rt_info(true, "model_info", "embedded_processing");
    ov_model->set_rt_info(input_shape.width, "model_info", "orig_width");
    ov_model->set_rt_info(input_shape.height, "model_info", "orig_height");
}

std::map<std::string, ov::Tensor> KeypointDetection::preprocess(cv::Mat image) {
    std::map<std::string, ov::Tensor> input = {};
    input.emplace(adapter->getInputNames()[0], utils::wrapMat2Tensor(image));
    return input;
}

KeypointDetectionResult KeypointDetection::postprocess(InferenceResult& infResult) {
    auto outputNames = adapter->getOutputNames();

    const ov::Tensor& pred_x_tensor = infResult.data.find(outputNames[0])->second;
    size_t shape_offset = pred_x_tensor.get_shape().size() == 3 ? 1 : 0;
    auto pred_x_mat = cv::Mat(cv::Size(static_cast<int>(pred_x_tensor.get_shape()[shape_offset + 1]),
                                       static_cast<int>(pred_x_tensor.get_shape()[shape_offset])),
                              CV_32F,
                              pred_x_tensor.data(),
                              pred_x_tensor.get_strides()[shape_offset]);

    const ov::Tensor& pred_y_tensor = infResult.data.find(outputNames[1])->second;
    shape_offset = pred_y_tensor.get_shape().size() == 3 ? 1 : 0;
    auto pred_y_mat = cv::Mat(cv::Size(static_cast<int>(pred_y_tensor.get_shape()[shape_offset + 1]),
                                       static_cast<int>(pred_y_tensor.get_shape()[shape_offset])),
                              CV_32F,
                              pred_y_tensor.data(),
                              pred_y_tensor.get_strides()[shape_offset]);

    float inverted_scale_x = static_cast<float>(infResult.inputImageSize.width) / input_shape.width,
          inverted_scale_y = static_cast<float>(infResult.inputImageSize.height) / input_shape.height;

    int pad_left = 0, pad_top = 0;
    if (utils::RESIZE_MODE::RESIZE_KEEP_ASPECT == resize_mode ||
        utils::RESIZE_MODE::RESIZE_KEEP_ASPECT_LETTERBOX == resize_mode) {
        inverted_scale_x = inverted_scale_y = std::max(inverted_scale_x, inverted_scale_y);
        if (utils::RESIZE_MODE::RESIZE_KEEP_ASPECT_LETTERBOX == resize_mode) {
            pad_left =
                (input_shape.width -
                 static_cast<int>(std::round(static_cast<float>(infResult.inputImageSize.width) / inverted_scale_x))) /
                2;
            pad_top =
                (input_shape.height -
                 static_cast<int>(std::round(static_cast<float>(infResult.inputImageSize.height) / inverted_scale_y))) /
                2;
        }
    }

    return decode_simcc(pred_x_mat,
                        pred_y_mat,
                        {inverted_scale_x, inverted_scale_y},
                        {pad_left, pad_top},
                        apply_softmax);
}

KeypointDetectionResult KeypointDetection::infer(cv::Mat image) {
    return pipeline.infer(image);
}

std::vector<KeypointDetectionResult> KeypointDetection::inferBatch(std::vector<cv::Mat> images) {
    return pipeline.inferBatch(images);
}
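
A standalone toy sketch (not part of the commit) of the SimCC decoding that decode_simcc implements: per-row argmax over the x and y logits, score = min of the two confidences, coordinate = bin index divided by simcc_split_ratio. In postprocess(), the resize scale and letterbox padding are additionally passed in as extra_scale and extra_offset.

#include <algorithm>
#include <iostream>
#include <opencv2/opencv.hpp>

int main() {
    // One keypoint: 8 x-bins and 6 y-bins; with simcc_split_ratio = 2 there are
    // two bins per pixel of the model input.
    float x_logits[8] = {0.f, 0.f, 0.f, 5.f, 0.f, 0.f, 0.f, 0.f};  // peak at bin 3
    float y_logits[6] = {0.f, 4.f, 0.f, 0.f, 0.f, 0.f};            // peak at bin 1
    cv::Mat simcc_x(1, 8, CV_32F, x_logits);
    cv::Mat simcc_y(1, 6, CV_32F, y_logits);

    // Row-wise argmax, as colArgMax does without softmax.
    double max_x, max_y;
    cv::Point x_loc, y_loc;
    cv::minMaxLoc(simcc_x, nullptr, &max_x, nullptr, &x_loc);
    cv::minMaxLoc(simcc_y, nullptr, &max_y, nullptr, &y_loc);

    // Default extra_scale = (1, 1) and extra_offset = (0, 0) assumed here.
    const float simcc_split_ratio = 2.0f;
    cv::Point2f keypoint(x_loc.x / simcc_split_ratio, y_loc.x / simcc_split_ratio);
    float score = std::min(static_cast<float>(max_x), static_cast<float>(max_y));

    std::cout << "keypoint: " << keypoint << ", score: " << score << std::endl;  // [1.5, 0.5], score 4
    return 0;
}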

tests/cpp/test_accuracy.cpp

Lines changed: 32 additions & 0 deletions
@@ -11,6 +11,7 @@
 #include "tasks/classification.h"
 #include "tasks/detection.h"
 #include "tasks/instance_segmentation.h"
+#include "tasks/keypoint_detection.h"
 #include "tasks/semantic_segmentation.h"

 std::string PUBLIC_SCOPE_PATH = "../public_scope.json";
@@ -124,11 +125,22 @@ TEST_P(ModelParameterizedTest, AccuracyTest) {
             std::string image_path = DATA_DIR + '/' + test_data.image;
             cv::Mat image = cv::imread(image_path);
             auto result = model.infer(image);
+
             EXPECT_EQ(std::string{result}, test_data.reference[0]);
         }
     } else if (data.type == "AnomalyDetection") {
         auto model = Anomaly::create_model(model_path);

+        for (auto& test_data : data.test_data) {
+            std::string image_path = DATA_DIR + '/' + test_data.image;
+            cv::Mat image = cv::imread(image_path);
+            auto result = model.infer(image);
+
+            EXPECT_EQ(std::string{result}, test_data.reference[0]);
+        }
+    } else if (data.type == "KeypointDetectionModel") {
+        auto model = KeypointDetection::create_model(model_path);
+
         for (auto& test_data : data.test_data) {
             std::string image_path = DATA_DIR + '/' + test_data.image;
             cv::Mat image = cv::imread(image_path);
@@ -199,6 +211,15 @@ TEST_P(ModelParameterizedTest, SerializedAccuracyTest) {
             cv::Mat image = cv::imread(image_path);
             auto result = model.infer(image);

+            EXPECT_EQ(std::string{result}, test_data.reference[0]);
+        }
+    } else if (data.type == "KeypointDetectionModel") {
+        auto model = KeypointDetection::create_model(model_path);
+        for (auto& test_data : data.test_data) {
+            std::string image_path = DATA_DIR + '/' + test_data.image;
+            cv::Mat image = cv::imread(image_path);
+            auto result = model.infer(image);
+
             EXPECT_EQ(std::string{result}, test_data.reference[0]);
         }
     } else {
@@ -260,6 +281,17 @@ TEST_P(ModelParameterizedTest, AccuracyTestBatch) {
     } else if (data.type == "AnomalyDetection") {
         auto model = Anomaly::create_model(model_path);

+        for (auto& test_data : data.test_data) {
+            std::string image_path = DATA_DIR + '/' + test_data.image;
+            cv::Mat image = cv::imread(image_path);
+            auto result = model.inferBatch({image});
+
+            ASSERT_EQ(result.size(), 1);
+            EXPECT_EQ(std::string{result[0]}, test_data.reference[0]);
+        }
+    } else if (data.type == "KeypointDetectionModel") {
+        auto model = KeypointDetection::create_model(model_path);
+
         for (auto& test_data : data.test_data) {
             std::string image_path = DATA_DIR + '/' + test_data.image;
             cv::Mat image = cv::imread(image_path);
tests/python/accuracy/public_scope.json

Lines changed: 12 additions & 0 deletions
@@ -255,5 +255,17 @@
                 ]
             }
         ]
+    },
+    {
+        "name": "otx_models/rtmpose_tiny.xml",
+        "type": "KeypointDetectionModel",
+        "test_data": [
+            {
+                "image": "coco128/images/train2017/000000000471.jpg",
+                "reference": [
+                    "keypoints: (17, 2), keypoints_x_sum: 2930.000, scores: (17,) 14.061"
+                ]
+            }
+        ]
     }
 ]
