|
| 1 | +/* |
| 2 | + * Copyright (C) 2024-2025 Intel Corporation |
| 3 | + * SPDX-License-Identifier: Apache-2.0 |
| 4 | + */ |
| 5 | + |
#include "tasks/keypoint_detection.h"

#include <algorithm>
#include <cmath>
#include <iostream>

#include "adapters/openvino_adapter.h"
#include "utils/config.h"
#include "utils/tensor.h"
| 11 | + |
| 12 | +namespace { |
| 13 | + |
| 14 | +void colArgMax(const cv::Mat& src, |
| 15 | + cv::Mat& dst_locs, |
| 16 | + cv::Mat& dst_values, |
| 17 | + bool apply_softmax = false, |
| 18 | + float eps = 1e-6f) { |
| 19 | + dst_locs = cv::Mat::zeros(src.rows, 1, CV_32S); |
| 20 | + dst_values = cv::Mat::zeros(src.rows, 1, CV_32F); |
| 21 | + |
| 22 | + for (int row = 0; row < src.rows; ++row) { |
| 23 | + const float* ptr_row = src.ptr<float>(row); |
| 24 | + int max_val_idx = 0; |
| 25 | + float max_val = ptr_row[0]; |
| 26 | + for (int col = 1; col < src.cols; ++col) { |
| 27 | + if (ptr_row[col] > max_val) { |
| 28 | + max_val_idx = col; |
| 29 | + dst_locs.at<int>(row) = max_val_idx; |
| 30 | + max_val = ptr_row[col]; |
| 31 | + } |
| 32 | + } |
| 33 | + |
| 34 | + if (apply_softmax) { |
| 35 | + float sum = 0.0f; |
| 36 | + for (int col = 0; col < src.cols; ++col) { |
| 37 | + sum += exp(ptr_row[col] - max_val); |
| 38 | + } |
| 39 | + dst_values.at<float>(row) = exp(ptr_row[max_val_idx] - max_val) / (sum + eps); |
| 40 | + } else { |
| 41 | + dst_values.at<float>(row) = max_val; |
| 42 | + } |
| 43 | + } |
| 44 | +} |
| 45 | + |
| 46 | +KeypointDetectionResult decode_simcc(const cv::Mat& simcc_x, |
| 47 | + const cv::Mat& simcc_y, |
| 48 | + const cv::Point2f& extra_scale = cv::Point2f(1.f, 1.f), |
| 49 | + const cv::Point2i& extra_offset = cv::Point2f(0.f, 0.f), |
| 50 | + bool apply_softmax = false, |
| 51 | + float simcc_split_ratio = 2.0f, |
| 52 | + float decode_beta = 150.0f, |
| 53 | + float sigma = 6.0f) { |
| 54 | + cv::Mat x_locs, max_val_x; |
| 55 | + std::cout << cv::sum(simcc_x) << "\n"; |
| 56 | + std::cout << cv::sum(simcc_y) << "\n"; |
| 57 | + colArgMax(simcc_x, x_locs, max_val_x, false); |
| 58 | + |
| 59 | + cv::Mat y_locs, max_val_y; |
| 60 | + colArgMax(simcc_y, y_locs, max_val_y, false); |
| 61 | + |
| 62 | + if (apply_softmax) { |
| 63 | + cv::Mat tmp_locs; |
| 64 | + colArgMax(decode_beta * sigma * simcc_x, tmp_locs, max_val_x, true); |
| 65 | + colArgMax(decode_beta * sigma * simcc_y, tmp_locs, max_val_y, true); |
| 66 | + } |
| 67 | + |
| 68 | + std::vector<cv::Point2f> keypoints(x_locs.rows); |
| 69 | + cv::Mat scores = cv::Mat::zeros(x_locs.rows, 1, CV_32F); |
| 70 | + for (int i = 0; i < x_locs.rows; ++i) { |
| 71 | + keypoints[i] = cv::Point2f((x_locs.at<int>(i) - extra_offset.x) * extra_scale.x, |
| 72 | + (y_locs.at<int>(i) - extra_offset.y) * extra_scale.y) / |
| 73 | + simcc_split_ratio; |
| 74 | + scores.at<float>(i) = std::min(max_val_x.at<float>(i), max_val_y.at<float>(i)); |
| 75 | + |
| 76 | + if (scores.at<float>(i) <= 0.f) { |
| 77 | + keypoints[i] = cv::Point2f(-1.f, -1.f); |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + return {std::move(keypoints), scores}; |
| 82 | +} |
| 83 | + |
| 84 | +} // namespace |
| 85 | + |
| 86 | +KeypointDetection KeypointDetection::create_model(const std::string& model_path, |
| 87 | + const ov::AnyMap& user_config, |
| 88 | + bool preload, |
| 89 | + const std::string& device) { |
| 90 | + auto adapter = std::make_shared<OpenVINOInferenceAdapter>(); |
| 91 | + adapter->loadModel(model_path, device, user_config, false); |
| 92 | + |
| 93 | + std::string model_type; |
| 94 | + model_type = utils::get_from_any_maps("model_type", user_config, adapter->getModelConfig(), model_type); |
| 95 | + |
| 96 | + if (model_type.empty() || model_type != "keypoint_detection") { |
| 97 | + throw std::runtime_error("Incorrect or unsupported model_type, expected: keypoint_detection"); |
| 98 | + } |
| 99 | + adapter->applyModelTransform(KeypointDetection::serialize); |
| 100 | + if (preload) { |
| 101 | + adapter->compileModel(device, user_config); |
| 102 | + } |
| 103 | + |
| 104 | + return KeypointDetection(adapter, user_config); |
| 105 | +} |
| 106 | + |
// Embeds preprocessing (resize, padding, channel reversal, normalization)
// directly into the OV model graph and records the result in rt_info, so the
// compiled model accepts raw images. Idempotent: returns early if the model
// already carries the embedded_processing flag.
// Throws std::logic_error if the model does not have exactly one 4-D,
// 3-channel input or exactly two outputs.
void KeypointDetection::serialize(std::shared_ptr<ov::Model>& ov_model) {
    if (utils::model_has_embedded_processing(ov_model)) {
        std::cout << "model already was serialized" << std::endl;
        return;
    }
    if (ov_model->inputs().size() != 1) {
        throw std::logic_error("KeypointDetection model wrapper supports topologies with only 1 input");
    }
    const auto& input = ov_model->input();
    // Preprocessing parameters come from the model's own "model_info" rt_info.
    auto config = ov_model->has_rt_info("model_info") ? ov_model->get_rt_info<ov::AnyMap>("model_info") : ov::AnyMap{};
    std::string layout = "";
    layout = utils::get_from_any_maps("layout", config, {}, layout);
    auto inputsLayouts = utils::parseLayoutString(layout);
    const ov::Layout& inputLayout = utils::getInputLayout(input, inputsLayouts);
    // Max shape resolves dynamic dimensions to their upper bound.
    const ov::Shape& inputShape = input.get_partial_shape().get_max_shape();
    if (inputShape.size() != 4 || inputShape[ov::layout::channels_idx(inputLayout)] != 3) {
        throw std::logic_error("3-channel 4-dimensional model's input is expected");
    }

    auto interpolation_mode = cv::INTER_LINEAR;
    // RESIZE_FILL is the default when model_info provides no resize_type.
    utils::RESIZE_MODE resize_mode = utils::RESIZE_MODE::RESIZE_FILL;
    resize_mode = utils::get_from_any_maps("resize_type", config, ov::AnyMap{}, resize_mode);

    // Empty mean/scale vectors mean "no normalization" —
    // presumably handled inside embedProcessing; verify there.
    std::vector<float> scale_values;
    std::vector<float> mean_values;
    scale_values = utils::get_from_any_maps("scale_values", config, ov::AnyMap{}, scale_values);
    mean_values = utils::get_from_any_maps("mean_values", config, ov::AnyMap{}, mean_values);
    uint8_t pad_value = 0;
    // Read as unsigned, then narrowed to uint8_t for the pad fill value.
    pad_value = utils::get_from_any_maps<unsigned>("pad_value", config, ov::AnyMap{}, pad_value);
    bool reverse_input_channels = false;
    reverse_input_channels =
        utils::get_from_any_maps("reverse_input_channels", config, ov::AnyMap{}, reverse_input_channels);

    cv::Size input_shape(inputShape[ov::layout::width_idx(inputLayout)],
                         inputShape[ov::layout::height_idx(inputLayout)]);

    // NOTE: embedProcessing takes the target shape as {width, height}.
    ov_model = utils::embedProcessing(
        ov_model,
        input.get_any_name(),
        inputLayout,
        resize_mode,
        interpolation_mode,
        ov::Shape{static_cast<size_t>(input_shape.width), static_cast<size_t>(input_shape.height)},
        pad_value,
        reverse_input_channels,
        mean_values,
        scale_values);

    // --------------------------- Check output -----------------------------------------------------

    // SimCC keypoint heads produce two outputs (x and y logits).
    if (ov_model->outputs().size() != 2) {
        throw std::logic_error(std::string{"KeypointDetection model wrapper supports topologies with 2 outputs"});
    }

    // Persist the flags postprocess() relies on (orig_width/orig_height are
    // read back as input_shape by the wrapper).
    ov_model->set_rt_info(true, "model_info", "embedded_processing");
    ov_model->set_rt_info(input_shape.width, "model_info", "orig_width");
    ov_model->set_rt_info(input_shape.height, "model_info", "orig_height");
}
| 165 | + |
| 166 | +std::map<std::string, ov::Tensor> KeypointDetection::preprocess(cv::Mat image) { |
| 167 | + std::map<std::string, ov::Tensor> input = {}; |
| 168 | + input.emplace(adapter->getInputNames()[0], utils::wrapMat2Tensor(image)); |
| 169 | + return input; |
| 170 | +} |
| 171 | + |
| 172 | +KeypointDetectionResult KeypointDetection::postprocess(InferenceResult& infResult) { |
| 173 | + auto outputNames = adapter->getOutputNames(); |
| 174 | + |
| 175 | + const ov::Tensor& pred_x_tensor = infResult.data.find(outputNames[0])->second; |
| 176 | + size_t shape_offset = pred_x_tensor.get_shape().size() == 3 ? 1 : 0; |
| 177 | + auto pred_x_mat = cv::Mat(cv::Size(static_cast<int>(pred_x_tensor.get_shape()[shape_offset + 1]), |
| 178 | + static_cast<int>(pred_x_tensor.get_shape()[shape_offset])), |
| 179 | + CV_32F, |
| 180 | + pred_x_tensor.data(), |
| 181 | + pred_x_tensor.get_strides()[shape_offset]); |
| 182 | + |
| 183 | + const ov::Tensor& pred_y_tensor = infResult.data.find(outputNames[1])->second; |
| 184 | + shape_offset = pred_y_tensor.get_shape().size() == 3 ? 1 : 0; |
| 185 | + auto pred_y_mat = cv::Mat(cv::Size(static_cast<int>(pred_y_tensor.get_shape()[shape_offset + 1]), |
| 186 | + static_cast<int>(pred_y_tensor.get_shape()[shape_offset])), |
| 187 | + CV_32F, |
| 188 | + pred_y_tensor.data(), |
| 189 | + pred_y_tensor.get_strides()[shape_offset]); |
| 190 | + |
| 191 | + float inverted_scale_x = static_cast<float>(infResult.inputImageSize.width) / input_shape.width, |
| 192 | + inverted_scale_y = static_cast<float>(infResult.inputImageSize.height) / input_shape.height; |
| 193 | + |
| 194 | + int pad_left = 0, pad_top = 0; |
| 195 | + if (utils::RESIZE_MODE::RESIZE_KEEP_ASPECT == resize_mode || |
| 196 | + utils::RESIZE_MODE::RESIZE_KEEP_ASPECT_LETTERBOX == resize_mode) { |
| 197 | + inverted_scale_x = inverted_scale_y = std::max(inverted_scale_x, inverted_scale_y); |
| 198 | + if (utils::RESIZE_MODE::RESIZE_KEEP_ASPECT_LETTERBOX == resize_mode) { |
| 199 | + pad_left = |
| 200 | + (input_shape.width - |
| 201 | + static_cast<int>(std::round(static_cast<float>(infResult.inputImageSize.width) / inverted_scale_x))) / |
| 202 | + 2; |
| 203 | + pad_top = |
| 204 | + (input_shape.height - |
| 205 | + static_cast<int>(std::round(static_cast<float>(infResult.inputImageSize.height) / inverted_scale_y))) / |
| 206 | + 2; |
| 207 | + } |
| 208 | + } |
| 209 | + |
| 210 | + return decode_simcc(pred_x_mat, |
| 211 | + pred_y_mat, |
| 212 | + {inverted_scale_x, inverted_scale_y}, |
| 213 | + {pad_left, pad_top}, |
| 214 | + apply_softmax); |
| 215 | +} |
| 216 | + |
// Runs the full pipeline (preprocess -> inference -> postprocess) on one image.
KeypointDetectionResult KeypointDetection::infer(cv::Mat image) {
    return pipeline.infer(image);
}
| 220 | + |
// Batched variant of infer(): one result per input image, in input order.
std::vector<KeypointDetectionResult> KeypointDetection::inferBatch(std::vector<cv::Mat> images) {
    return pipeline.inferBatch(images);
}
0 commit comments