diff --git a/model_api/cpp/models/src/associative_embedding_decoder.cpp b/model_api/cpp/models/src/associative_embedding_decoder.cpp deleted file mode 100644 index dfb4f916..00000000 --- a/model_api/cpp/models/src/associative_embedding_decoder.cpp +++ /dev/null @@ -1,201 +0,0 @@ -/* -// Copyright (C) 2021-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "models/associative_embedding_decoder.h" - -#include -#include -#include -#include -#include - -#include - -void findPeaks(const std::vector& nmsHeatMaps, - const std::vector& aembdsMaps, - std::vector>& allPeaks, - size_t jointId, - size_t maxNumPeople, - float detectionThreshold) { - const cv::Mat& nmsHeatMap = nmsHeatMaps[jointId]; - const float* heatMapData = nmsHeatMap.ptr(); - cv::Size outputSize = nmsHeatMap.size(); - - std::vector indices(outputSize.area()); - std::iota(std::begin(indices), std::end(indices), 0); - std::partial_sort(std::begin(indices), - std::begin(indices) + maxNumPeople, - std::end(indices), - [heatMapData](int l, int r) { - return heatMapData[l] > heatMapData[r]; - }); - - for (size_t personId = 0; personId < maxNumPeople; personId++) { - int index = indices[personId]; - int x = index / outputSize.width; - int y = index % outputSize.width; - float tag = aembdsMaps[jointId].at(x, y); - float score = heatMapData[index]; - allPeaks[jointId].reserve(maxNumPeople); - if (score > detectionThreshold) { - allPeaks[jointId].emplace_back(Peak{cv::Point2f(static_cast(x), static_cast(y)), score, tag}); - } - } -} - -std::vector matchByTag(std::vector>& allPeaks, - size_t maxNumPeople, - size_t numJoints, - float tagThreshold) { - std::vector allPoses; - for (size_t jointId : {0, 1, 2, 3, 4, 5, 6, 11, 12, 7, 8, 9, 10, 13, 14, 15, 16}) { - std::vector& jointPeaks = allPeaks[jointId]; - std::vector tags; - tags.reserve(jointPeaks.size()); - for (const Peak& peak : jointPeaks) { - tags.push_back(peak.tag); - } - if (allPoses.empty()) { - for (size_t personId = 0; personId < jointPeaks.size(); personId++) { - Peak peak = jointPeaks[personId]; - Pose pose = Pose(numJoints); - pose.add(jointId, peak); - allPoses.push_back(pose); - } - continue; - } - if (jointPeaks.empty() || (allPoses.size() == maxNumPeople)) { - continue; - } - std::vector posesTags; - std::vector posesCenters; - for (auto& pose : allPoses) { - posesTags.push_back(pose.getPoseTag()); - posesCenters.push_back(pose.getPoseCenter()); - } - size_t numAdded = tags.size(); - size_t numGrouped = posesTags.size(); - cv::Mat tagsDiff(numAdded, numGrouped, CV_32F); - cv::Mat matchingCost(numAdded, numGrouped, CV_32F); - std::vector dists(numAdded); - for (size_t j = 0; j < numGrouped; j++) { - float minDist = std::numeric_limits::max(); - // Compute euclidean distance (in spatial space) between the pose center and all joints. 
- const cv::Point2f center = posesCenters.at(j); - for (size_t i = 0; i < numAdded; i++) { - cv::Point2f v = jointPeaks.at(i).keypoint - center; - float dist = std::sqrt(v.x * v.x + v.y * v.y); - dists[i] = dist; - minDist = std::min(dist, minDist); - } - // Compute semantic distance (in embedding space) between the pose tag and all joints - // and corresponding matching costs. - auto poseTag = posesTags[j]; - for (size_t i = 0; i < numAdded; i++) { - float diff = static_cast(cv::norm(tags[i] - poseTag)); - tagsDiff.at(i, j) = diff; - if (diff < tagThreshold) { - diff *= dists[i] / (minDist + 1e-10f); - } - matchingCost.at(i, j) = std::round(diff) * 100 - jointPeaks[i].score; - } - } - - if (numAdded > numGrouped) { - cv::copyMakeBorder(matchingCost, - matchingCost, - 0, - 0, - 0, - numAdded - numGrouped, - cv::BORDER_CONSTANT, - 10000000); - } - // Get pairs - auto res = KuhnMunkres().Solve(matchingCost); - for (size_t row = 0; row < res.size(); row++) { - size_t col = res[row]; - if (row < numAdded && col < numGrouped && tagsDiff.at(row, col) < tagThreshold) { - allPoses[col].add(jointId, jointPeaks[row]); - } else { - Pose pose = Pose(numJoints); - pose.add(jointId, jointPeaks[row]); - allPoses.push_back(pose); - } - } - } - return allPoses; -} - -namespace { -cv::Point2f adjustLocation(const int x, const int y, const cv::Mat& heatMap) { - cv::Point2f delta(0.f, 0.f); - int width = heatMap.cols; - int height = heatMap.rows; - if ((1 < x) && (x < width - 1) && (1 < y) && (y < height - 1)) { - auto diffX = heatMap.at(y, x + 1) - heatMap.at(y, x - 1); - auto diffY = heatMap.at(y + 1, x) - heatMap.at(y - 1, x); - delta.x = diffX > 0 ? 0.25f : -0.25f; - delta.y = diffY > 0 ? 0.25f : -0.25f; - } - return delta; -} -} // namespace - -void adjustAndRefine(std::vector& allPoses, - const std::vector& heatMaps, - const std::vector& aembdsMaps, - int poseId, - const float delta) { - Pose& pose = allPoses[poseId]; - float poseTag = pose.getPoseTag(); - for (size_t jointId = 0; jointId < pose.size(); jointId++) { - Peak& peak = pose.getPeak(jointId); - const cv::Mat& heatMap = heatMaps[jointId]; - const cv::Mat& aembds = aembdsMaps[jointId]; - - if (peak.score > 0) { - // Adjust - int x = static_cast(peak.keypoint.x); - int y = static_cast(peak.keypoint.y); - peak.keypoint += adjustLocation(x, y, heatMap); - if (delta) { - peak.keypoint.x += delta; - peak.keypoint.y += delta; - } - } else { - // Refine - // Get position with the closest tag value to the pose tag - cv::Mat diff = cv::abs(aembds - poseTag); - diff.convertTo(diff, CV_32S, 1.0, 0.0); - diff.convertTo(diff, CV_32F); - diff -= heatMap; - double min; - cv::Point2i minLoc; - cv::minMaxLoc(diff, &min, 0, &minLoc); - int x = minLoc.x; - int y = minLoc.y; - float val = heatMap.at(y, x); - if (val > 0) { - peak.keypoint.x = static_cast(x); - peak.keypoint.y = static_cast(y); - peak.keypoint += adjustLocation(x, y, heatMap); - // Peak score is assigned directly, so it does not affect the pose score. - peak.score = val; - } - } - } -} diff --git a/model_api/cpp/models/src/deblurring_model.cpp b/model_api/cpp/models/src/deblurring_model.cpp deleted file mode 100644 index c3543042..00000000 --- a/model_api/cpp/models/src/deblurring_model.cpp +++ /dev/null @@ -1,158 +0,0 @@ -/* -// Copyright (C) 2021-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "models/deblurring_model.h" - -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include "models/input_data.h" -#include "models/internal_model_data.h" -#include "models/results.h" - -DeblurringModel::DeblurringModel(const std::string& modelFile, - const cv::Size& inputImgSize, - const std::string& layout) - : ImageModel(modelFile, "standard", false, layout) { - netInputHeight = inputImgSize.height; - netInputWidth = inputImgSize.width; -} - -void DeblurringModel::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("Deblurring model wrapper supports topologies with only 1 input"); - } - - inputNames.push_back(model->input().get_any_name()); - - const ov::Shape& inputShape = model->input().get_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || - inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("3-channel 4-dimensional model's input is expected"); - } - - ov::preprocess::PrePostProcessor ppp(model); - ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC"); - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Prepare output ----------------------------------------------------- - if (model->outputs().size() != 1) { - throw std::logic_error("Deblurring model wrapper supports topologies with only 1 output"); - } - - outputNames.push_back(model->output().get_any_name()); - - const ov::Shape& outputShape = model->output().get_shape(); - const ov::Layout outputLayout("NCHW"); - if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 || - outputShape[ov::layout::channels_idx(outputLayout)] != 3) { - throw std::logic_error("3-channel 4-dimensional model's output is expected"); - } - - ppp.output().tensor().set_element_type(ov::element::f32); - model = ppp.build(); - - changeInputSize(model); -} - -void DeblurringModel::changeInputSize(std::shared_ptr& model) { - const ov::Layout& layout = ov::layout::get_layout(model->input()); - ov::Shape inputShape = model->input().get_shape(); - - const auto batchId = ov::layout::batch_idx(layout); - const auto heightId = ov::layout::height_idx(layout); - const auto widthId = ov::layout::width_idx(layout); - - if (inputShape[heightId] % stride || inputShape[widthId] % stride) { - throw std::logic_error("Model input shape HxW = " + std::to_string(inputShape[heightId]) + "x" + - std::to_string(inputShape[widthId]) + "must be divisible by stride " + - std::to_string(stride)); - } - - netInputHeight = static_cast((netInputHeight + stride - 1) / stride) * stride; - netInputWidth = static_cast((netInputWidth + stride - 1) / stride) * stride; - - inputShape[batchId] = 1; - inputShape[heightId] = netInputHeight; - 
inputShape[widthId] = netInputWidth; - - model->reshape(inputShape); -} - -std::shared_ptr DeblurringModel::preprocess(const InputData& inputData, InferenceInput& input) { - auto& image = inputData.asRef().inputImage; - size_t h = image.rows; - size_t w = image.cols; - cv::Mat resizedImage; - - if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) { - int bottom = netInputHeight - h; - int right = netInputWidth - w; - cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0); - } else { - slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; - cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight)); - } - input.emplace(inputNames[0], wrapMat2Tensor(resizedImage)); - - return std::make_shared(image.cols, image.rows); -} - -std::unique_ptr DeblurringModel::postprocess(InferenceResult& infResult) { - ImageResult* result = new ImageResult; - *static_cast(result) = static_cast(infResult); - - const auto& inputImgSize = infResult.internalModelData->asRef(); - const auto outputData = infResult.getFirstOutputTensor().data(); - - std::vector imgPlanes; - const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape(); - const ov::Layout outputLayout("NCHW"); - size_t outHeight = static_cast((outputShape[ov::layout::height_idx(outputLayout)])); - size_t outWidth = static_cast((outputShape[ov::layout::width_idx(outputLayout)])); - size_t numOfPixels = outWidth * outHeight; - imgPlanes = std::vector{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))}; - cv::Mat resultImg; - cv::merge(imgPlanes, resultImg); - - if (netInputHeight - stride < static_cast(inputImgSize.inputImgHeight) && - static_cast(inputImgSize.inputImgHeight) <= netInputHeight && - netInputWidth - stride < static_cast(inputImgSize.inputImgWidth) && - static_cast(inputImgSize.inputImgWidth) <= netInputWidth) { - result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); - } else { - cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); - } - - result->resultImage.convertTo(result->resultImage, CV_8UC3, 255); - - return std::unique_ptr(result); -} diff --git a/model_api/cpp/models/src/detection_model.cpp b/model_api/cpp/models/src/detection_model.cpp index 97d6a8f1..efb0c23b 100644 --- a/model_api/cpp/models/src/detection_model.cpp +++ b/model_api/cpp/models/src/detection_model.cpp @@ -15,10 +15,6 @@ */ #include "models/detection_model.h" -#include "models/detection_model_centernet.h" -#include "models/detection_model_faceboxes.h" -#include "models/detection_model_retinaface.h" -#include "models/detection_model_retinaface_pt.h" #include "models/detection_model_ssd.h" #include "models/detection_model_yolo.h" #include "models/detection_model_yolov3_onnx.h" @@ -75,18 +71,10 @@ std::unique_ptr DetectionModel::create_model(const std::string& } std::unique_ptr detectionModel; - if (model_type == ModelFaceBoxes::ModelType) { - detectionModel = std::unique_ptr(new ModelFaceBoxes(model, configuration)); - } else if (model_type == ModelRetinaFace::ModelType) { - detectionModel = std::unique_ptr(new ModelRetinaFace(model, configuration)); - } else if (model_type == ModelRetinaFacePT::ModelType) { - detectionModel = std::unique_ptr(new 
ModelRetinaFacePT(model, configuration)); - } else if (model_type == ModelSSD::ModelType || model_type == "SSD") { + if (model_type == ModelSSD::ModelType || model_type == "SSD") { detectionModel = std::unique_ptr(new ModelSSD(model, configuration)); } else if (model_type == ModelYoloX::ModelType) { detectionModel = std::unique_ptr(new ModelYoloX(model, configuration)); - } else if (model_type == ModelCenterNet::ModelType) { - detectionModel = std::unique_ptr(new ModelCenterNet(model, configuration)); } else if (model_type == YOLOv5::ModelType) { detectionModel = std::unique_ptr(new YOLOv5(model, configuration)); } else if (model_type == YOLOv8::ModelType) { @@ -111,18 +99,10 @@ std::unique_ptr DetectionModel::create_model(std::shared_ptr detectionModel; - if (model_type == ModelFaceBoxes::ModelType) { - detectionModel = std::unique_ptr(new ModelFaceBoxes(adapter)); - } else if (model_type == ModelRetinaFace::ModelType) { - detectionModel = std::unique_ptr(new ModelRetinaFace(adapter)); - } else if (model_type == ModelRetinaFacePT::ModelType) { - detectionModel = std::unique_ptr(new ModelRetinaFacePT(adapter)); - } else if (model_type == ModelSSD::ModelType || model_type == "SSD") { + if (model_type == ModelSSD::ModelType || model_type == "SSD") { detectionModel = std::unique_ptr(new ModelSSD(adapter)); } else if (model_type == ModelYoloX::ModelType) { detectionModel = std::unique_ptr(new ModelYoloX(adapter)); - } else if (model_type == ModelCenterNet::ModelType) { - detectionModel = std::unique_ptr(new ModelCenterNet(adapter)); } else { throw std::runtime_error("Incorrect or unsupported model_type is provided: " + model_type); } diff --git a/model_api/cpp/models/src/detection_model_centernet.cpp b/model_api/cpp/models/src/detection_model_centernet.cpp deleted file mode 100644 index 05ef808e..00000000 --- a/model_api/cpp/models/src/detection_model_centernet.cpp +++ /dev/null @@ -1,330 +0,0 @@ -/* -// Copyright (C) 2020-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "models/detection_model_centernet.h" - -#include -#include -#include - -#include -#include -#include - -#include "models/input_data.h" -#include "models/internal_model_data.h" -#include "models/results.h" - -std::string ModelCenterNet::ModelType = "centernet"; - -void ModelCenterNet::initDefaultParameters(const ov::AnyMap&) { - resizeMode = RESIZE_KEEP_ASPECT_LETTERBOX; // Ignore configuration for now - useAutoResize = false; -} - -ModelCenterNet::ModelCenterNet(std::shared_ptr& model, const ov::AnyMap& configuration) - : DetectionModel(model, configuration) { - initDefaultParameters(configuration); -} - -ModelCenterNet::ModelCenterNet(std::shared_ptr& adapter) - : DetectionModel(adapter) { - const ov::AnyMap& configuration = adapter->getModelConfig(); - initDefaultParameters(configuration); -} - - -void ModelCenterNet::updateModelInfo() { - DetectionModel::updateModelInfo(); - - model->set_rt_info(ModelCenterNet::ModelType, "model_info", "model_type"); -} - -void ModelCenterNet::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("CenterNet model wrapper expects models that have only 1 input"); - } - - const ov::Shape& inputShape = model->input().get_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("Expected 3-channel input"); - } - - ov::preprocess::PrePostProcessor ppp(model); - inputTransform.setPrecision(ppp, model->input().get_any_name()); - ppp.input().tensor().set_layout("NHWC"); - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Reading image input parameters ------------------------------------------- - inputNames.push_back(model->input().get_any_name()); - netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; - netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; - - // --------------------------- Prepare output ----------------------------------------------------- - if (model->outputs().size() != 3) { - throw std::logic_error("CenterNet model wrapper expects models that have 3 outputs"); - } - - const ov::Layout outLayout{"NCHW"}; - for (const auto& output : model->outputs()) { - auto outTensorName = output.get_any_name(); - outputNames.push_back(outTensorName); - ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outLayout); - } - std::sort(outputNames.begin(), outputNames.end()); - model = ppp.build(); -} - -cv::Point2f getDir(const cv::Point2f& srcPoint, float rotRadius) { - float sn = sinf(rotRadius); - float cs = cosf(rotRadius); - - cv::Point2f srcResult(0.0f, 0.0f); - srcResult.x = srcPoint.x * cs - srcPoint.y * sn; - srcResult.y = srcPoint.x * sn + srcPoint.y * cs; - - return srcResult; -} - -cv::Point2f get3rdPoint(const cv::Point2f& a, const cv::Point2f& b) { - cv::Point2f direct = a - b; - return b + cv::Point2f(-direct.y, direct.x); -} - -cv::Mat getAffineTransform(float centerX, - float centerY, - int srcW, - float rot, - size_t outputWidth, - size_t outputHeight, - bool inv = false) { - float rotRad = static_cast(CV_PI) * rot / 180.0f; - auto srcDir = getDir({0.0f, -0.5f * srcW}, rotRad); - cv::Point2f dstDir(0.0f, -0.5f * outputWidth); - std::vector src(3, {0.0f, 0.0f}); - std::vector 
dst(3, {0.0f, 0.0f}); - - src[0] = {centerX, centerY}; - src[1] = srcDir + src[0]; - src[2] = get3rdPoint(src[0], src[1]); - - dst[0] = {outputWidth * 0.5f, outputHeight * 0.5f}; - dst[1] = dst[0] + dstDir; - dst[2] = get3rdPoint(dst[0], dst[1]); - - cv::Mat trans; - if (inv) { - trans = cv::getAffineTransform(dst, src); - } else { - trans = cv::getAffineTransform(src, dst); - } - - return trans; -} - -std::shared_ptr ModelCenterNet::preprocess(const InputData& inputData, InferenceInput& input) { - auto& img = inputData.asRef().inputImage; - const auto& resizedImg = resizeImageExt(img, netInputWidth, netInputHeight, RESIZE_KEEP_ASPECT_LETTERBOX); - - input.emplace(inputNames[0], wrapMat2Tensor(inputTransform(resizedImg))); - return std::make_shared(img.cols, img.rows); -} - -namespace { -std::vector> nms(float* scoresPtr, const ov::Shape& shape, float threshold, int kernel = 3) { - std::vector> scores; - constexpr size_t INIT_VECTOR_SIZE = 200; - scores.reserve(INIT_VECTOR_SIZE); - auto chSize = shape[2] * shape[3]; - - for (size_t i = 0; i < shape[1] * shape[2] * shape[3]; ++i) { - scoresPtr[i] = expf(scoresPtr[i]) / (1 + expf(scoresPtr[i])); - } - - for (size_t ch = 0; ch < shape[1]; ++ch) { - for (size_t w = 0; w < shape[2]; ++w) { - for (size_t h = 0; h < shape[3]; ++h) { - float max = scoresPtr[chSize * ch + shape[2] * w + h]; - - // --------------------- filter on threshold-------------------------------------- - if (max < threshold) { - continue; - } - - // --------------------- store index and score------------------------------------ - scores.push_back({chSize * ch + shape[2] * w + h, max}); - - bool next = true; - // ---------------------- maxpool2d ----------------------------------------------- - for (int i = -kernel / 2; i < kernel / 2 + 1 && next; ++i) { - for (int j = -kernel / 2; j < kernel / 2 + 1; ++j) { - if (int(w) + i >= 0 && w + i < shape[2] && int(h) + j >= 0 && h + j < shape[3]) { - if (scoresPtr[chSize * ch + shape[2] * (w + i) + h + j] > max) { - scores.pop_back(); - next = false; - break; - } - } else { - if (max < 0) { - scores.pop_back(); - next = false; - break; - } - } - } - } - } - } - } - - return scores; -} - -static std::vector> filterScores(const ov::Tensor& scoresTensor, float threshold) { - auto shape = scoresTensor.get_shape(); - float* scoresPtr = scoresTensor.data(); - - return nms(scoresPtr, shape, threshold); -} - -std::vector> filterReg(const ov::Tensor& regressionTensor, - const std::vector>& scores, - size_t chSize) { - const float* regPtr = regressionTensor.data(); - std::vector> reg; - - for (const auto& s : scores) { - reg.emplace_back(regPtr[s.first % chSize], regPtr[chSize + s.first % chSize]); - } - - return reg; -} - -std::vector> filterWH(const ov::Tensor& whTensor, - const std::vector>& scores, - size_t chSize) { - const float* whPtr = whTensor.data(); - std::vector> wh; - - for (const auto& s : scores) { - wh.emplace_back(whPtr[s.first % chSize], whPtr[chSize + s.first % chSize]); - } - - return wh; -} - -struct BBox { - float left; - float top; - float right; - float bottom; - - float getWidth() const { - return (right - left) + 1.0f; - } - float getHeight() const { - return (bottom - top) + 1.0f; - } -}; - -std::vector calcBoxes(const std::vector>& scores, - const std::vector>& reg, - const std::vector>& wh, - const ov::Shape& shape) { - std::vector boxes(scores.size()); - - for (size_t i = 0; i < boxes.size(); ++i) { - size_t chIdx = scores[i].first % (shape[2] * shape[3]); - auto xCenter = chIdx % shape[3]; - auto yCenter = chIdx 
/ shape[3]; - - boxes[i].left = xCenter + reg[i].first - wh[i].first / 2.0f; - boxes[i].top = yCenter + reg[i].second - wh[i].second / 2.0f; - boxes[i].right = xCenter + reg[i].first + wh[i].first / 2.0f; - boxes[i].bottom = yCenter + reg[i].second + wh[i].second / 2.0f; - } - - return boxes; -} - -void transform(std::vector& boxes, - const ov::Shape& shape, - int scale, - float centerX, - float centerY) { - cv::Mat1f trans = getAffineTransform(centerX, centerY, scale, 0, shape[2], shape[3], true); - - for (auto& b : boxes) { - BBox newbb; - - newbb.left = trans.at(0, 0) * b.left + trans.at(0, 1) * b.top + trans.at(0, 2); - newbb.top = trans.at(1, 0) * b.left + trans.at(1, 1) * b.top + trans.at(1, 2); - newbb.right = trans.at(0, 0) * b.right + trans.at(0, 1) * b.bottom + trans.at(0, 2); - newbb.bottom = trans.at(1, 0) * b.right + trans.at(1, 1) * b.bottom + trans.at(1, 2); - - b = newbb; - } -} -} - -std::unique_ptr ModelCenterNet::postprocess(InferenceResult& infResult) { - // --------------------------- Filter data and get valid indices --------------------------------- - const auto& heatmapTensor = infResult.outputsData[outputNames[0]]; - const auto& heatmapTensorShape = heatmapTensor.get_shape(); - const auto chSize = heatmapTensorShape[2] * heatmapTensorShape[3]; - const auto scores = filterScores(heatmapTensor, confidence_threshold); - - const auto& regressionTensor = infResult.outputsData[outputNames[1]]; - const auto reg = filterReg(regressionTensor, scores, chSize); - - const auto& whTensor = infResult.outputsData[outputNames[2]]; - const auto wh = filterWH(whTensor, scores, chSize); - - // --------------------------- Calculate bounding boxes & apply inverse affine transform ---------- - auto boxes = calcBoxes(scores, reg, wh, heatmapTensorShape); - - const auto imgWidth = infResult.internalModelData->asRef().inputImgWidth; - const auto imgHeight = infResult.internalModelData->asRef().inputImgHeight; - const auto scale = std::max(imgWidth, imgHeight); - const float centerX = imgWidth / 2.0f; - const float centerY = imgHeight / 2.0f; - - transform(boxes, heatmapTensorShape, scale, centerX, centerY); - - // --------------------------- Create detection result objects ------------------------------------ - DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); - - result->objects.reserve(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - DetectedObject desc; - desc.confidence = scores[i].second; - desc.labelID = scores[i].first / chSize; - desc.label = getLabelName(desc.labelID); - desc.x = clamp(boxes[i].left, 0.f, static_cast(imgWidth)); - desc.y = clamp(boxes[i].top, 0.f, static_cast(imgHeight)); - desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast(imgWidth)); - desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast(imgHeight)); - - result->objects.push_back(desc); - } - - return std::unique_ptr(result); -} diff --git a/model_api/cpp/models/src/detection_model_faceboxes.cpp b/model_api/cpp/models/src/detection_model_faceboxes.cpp deleted file mode 100644 index 2f2ff48e..00000000 --- a/model_api/cpp/models/src/detection_model_faceboxes.cpp +++ /dev/null @@ -1,270 +0,0 @@ -/* -// Copyright (C) 2020-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "models/detection_model_faceboxes.h" - -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "models/internal_model_data.h" -#include "models/results.h" - -std::string ModelFaceBoxes::ModelType = "faceboxes"; - -void ModelFaceBoxes::initDefaultParameters(const ov::AnyMap& configuration) { - resizeMode = RESIZE_FILL; // Ignore resize_type for now - auto labels_string = configuration.find("labels"); // Override default if it is not set - if (labels_string == configuration.end()) { - labels = {"Face"}; - } -} - -ModelFaceBoxes::ModelFaceBoxes(std::shared_ptr& model, const ov::AnyMap& configuration) - : DetectionModelExt(model, configuration) { - initDefaultParameters(configuration); -} - -ModelFaceBoxes::ModelFaceBoxes(std::shared_ptr& adapter) - : DetectionModelExt(adapter) { - const ov::AnyMap& configuration = adapter->getModelConfig(); - initDefaultParameters(configuration); -} - -void ModelFaceBoxes::updateModelInfo() { - DetectionModelExt::updateModelInfo(); - - model->set_rt_info(ModelFaceBoxes::ModelType, "model_info", "model_type"); -} - -void ModelFaceBoxes::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("FaceBoxes model wrapper expects models that have only 1 input"); - } - - const ov::Shape& inputShape = model->input().get_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("Expected 3-channel input"); - } - - ov::preprocess::PrePostProcessor ppp(model); - inputTransform.setPrecision(ppp, model->input().get_any_name()); - ppp.input().tensor().set_layout({"NHWC"}); - - if (useAutoResize) { - ppp.input().tensor().set_spatial_dynamic_shape(); - - ppp.input() - .preprocess() - .convert_element_type(ov::element::f32) - .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); - } - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Reading image input parameters ------------------------------------------- - inputNames.push_back(model->input().get_any_name()); - netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; - netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; - - // --------------------------- Prepare output ----------------------------------------------------- - if (model->outputs().size() != 2) { - throw std::logic_error("FaceBoxes model wrapper expects models that have 2 outputs"); - } - - const ov::Layout outputLayout{"CHW"}; - maxProposalsCount = model->outputs().front().get_shape()[ov::layout::height_idx(outputLayout)]; - for (const auto& output : model->outputs()) { - const auto outTensorName = output.get_any_name(); - outputNames.push_back(outTensorName); - ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout); - } - 
std::sort(outputNames.begin(), outputNames.end()); - model = ppp.build(); - - // --------------------------- Calculating anchors ---------------------------------------------------- - std::vector> featureMaps; - for (auto s : steps) { - featureMaps.push_back({netInputHeight / s, netInputWidth / s}); - } - - priorBoxes(featureMaps); -} - -void calculateAnchors(std::vector& anchors, - const std::vector& vx, - const std::vector& vy, - const int minSize, - const int step) { - float skx = static_cast(minSize); - float sky = static_cast(minSize); - - std::vector dense_cx, dense_cy; - - std::transform(vx.begin(), vx.end(), std::back_inserter(dense_cx), [step](float x){return x * step;}); - std::transform(vy.begin(), vy.end(), std::back_inserter(dense_cy), [step](float y){return y * step;}); - - for (auto cy : dense_cy) { - for (auto cx : dense_cx) { - anchors.push_back( - {cx - 0.5f * skx, cy - 0.5f * sky, cx + 0.5f * skx, cy + 0.5f * sky}); // left top right bottom - } - } -} - -void calculateAnchorsZeroLevel(std::vector& anchors, - const int fx, - const int fy, - const std::vector& minSizes, - const int step) { - for (auto s : minSizes) { - std::vector vx, vy; - if (s == 32) { - vx.push_back(static_cast(fx)); - vx.push_back(fx + 0.25f); - vx.push_back(fx + 0.5f); - vx.push_back(fx + 0.75f); - - vy.push_back(static_cast(fy)); - vy.push_back(fy + 0.25f); - vy.push_back(fy + 0.5f); - vy.push_back(fy + 0.75f); - } else if (s == 64) { - vx.push_back(static_cast(fx)); - vx.push_back(fx + 0.5f); - - vy.push_back(static_cast(fy)); - vy.push_back(fy + 0.5f); - } else { - vx.push_back(fx + 0.5f); - vy.push_back(fy + 0.5f); - } - calculateAnchors(anchors, vx, vy, s, step); - } -} - -void ModelFaceBoxes::priorBoxes(const std::vector>& featureMaps) { - anchors.reserve(maxProposalsCount); - - for (size_t k = 0; k < featureMaps.size(); ++k) { - for (size_t i = 0; i < featureMaps[k].first; ++i) { - for (size_t j = 0; j < featureMaps[k].second; ++j) { - if (k == 0) { - calculateAnchorsZeroLevel(anchors, j, i, minSizes[k], steps[k]); - } else { - calculateAnchors(anchors, {j + 0.5f}, {i + 0.5f}, minSizes[k][0], steps[k]); - } - } - } - } -} - -std::pair, std::vector> filterScores(const ov::Tensor& scoresTensor, - const float confidence_threshold) { - auto shape = scoresTensor.get_shape(); - const float* scoresPtr = scoresTensor.data(); - - std::vector indices; - std::vector scores; - scores.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE); - indices.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE); - for (size_t i = 1; i < shape[1] * shape[2]; i = i + 2) { - if (scoresPtr[i] > confidence_threshold) { - indices.push_back(i / 2); - scores.push_back(scoresPtr[i]); - } - } - - return {indices, scores}; -} - -std::vector filterBoxes(const ov::Tensor& boxesTensor, - const std::vector& anchors, - const std::vector& validIndices, - const std::vector& variance) { - auto shape = boxesTensor.get_shape(); - const float* boxesPtr = boxesTensor.data(); - - std::vector boxes; - boxes.reserve(ModelFaceBoxes::INIT_VECTOR_SIZE); - for (auto i : validIndices) { - auto objStart = shape[2] * i; - - auto dx = boxesPtr[objStart]; - auto dy = boxesPtr[objStart + 1]; - auto dw = boxesPtr[objStart + 2]; - auto dh = boxesPtr[objStart + 3]; - - auto predCtrX = dx * variance[0] * anchors[i].getWidth() + anchors[i].getXCenter(); - auto predCtrY = dy * variance[0] * anchors[i].getHeight() + anchors[i].getYCenter(); - auto predW = exp(dw * variance[1]) * anchors[i].getWidth(); - auto predH = exp(dh * variance[1]) * anchors[i].getHeight(); - - 
boxes.push_back({static_cast(predCtrX - 0.5f * predW), - static_cast(predCtrY - 0.5f * predH), - static_cast(predCtrX + 0.5f * predW), - static_cast(predCtrY + 0.5f * predH)}); - } - - return boxes; -} - -std::unique_ptr ModelFaceBoxes::postprocess(InferenceResult& infResult) { - // Filter scores and get valid indices for bounding boxes - const auto scoresTensor = infResult.outputsData[outputNames[1]]; - const auto scores = filterScores(scoresTensor, confidence_threshold); - - // Filter bounding boxes on indices - auto boxesTensor = infResult.outputsData[outputNames[0]]; - std::vector boxes = filterBoxes(boxesTensor, anchors, scores.first, variance); - - // Apply Non-maximum Suppression - const std::vector& keep = nms(boxes, scores.second, iou_threshold); - - // Create detection result objects - DetectionResult* result = new DetectionResult(infResult.frameId, infResult.metaData); - const auto imgWidth = infResult.internalModelData->asRef().inputImgWidth; - const auto imgHeight = infResult.internalModelData->asRef().inputImgHeight; - const float scaleX = static_cast(netInputWidth) / imgWidth; - const float scaleY = static_cast(netInputHeight) / imgHeight; - - result->objects.reserve(keep.size()); - for (auto i : keep) { - DetectedObject desc; - desc.confidence = scores.second[i]; - desc.x = clamp(boxes[i].left / scaleX, 0.f, static_cast(imgWidth)); - desc.y = clamp(boxes[i].top / scaleY, 0.f, static_cast(imgHeight)); - desc.width = clamp(boxes[i].getWidth() / scaleX, 0.f, static_cast(imgWidth)); - desc.height = clamp(boxes[i].getHeight() / scaleY, 0.f, static_cast(imgHeight)); - desc.labelID = 0; - desc.label = labels[0]; - - result->objects.push_back(desc); - } - - return std::unique_ptr(result); -} diff --git a/model_api/cpp/models/src/detection_model_retinaface.cpp b/model_api/cpp/models/src/detection_model_retinaface.cpp deleted file mode 100644 index afaaf7a4..00000000 --- a/model_api/cpp/models/src/detection_model_retinaface.cpp +++ /dev/null @@ -1,408 +0,0 @@ -/* -// Copyright (C) 2020-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "models/detection_model_retinaface.h" - -#include - -#include -#include -#include - -#include -#include - -#include -#include - -#include "models/internal_model_data.h" -#include "models/results.h" - -std::string ModelRetinaFace::ModelType = "retinaface"; - -void ModelRetinaFace::initDefaultParameters(const ov::AnyMap& configuration) { - resizeMode = RESIZE_FILL; // Ignore resize_type for now - auto labels_string = configuration.find("labels"); // Override default if it is not set - if (labels_string == configuration.end()) { - labels = {"Face"}; - } - - generateAnchorsFpn(); -} - -ModelRetinaFace::ModelRetinaFace(std::shared_ptr& model, const ov::AnyMap& configuration) - : DetectionModelExt(model, configuration) { - initDefaultParameters(configuration); -} - -ModelRetinaFace::ModelRetinaFace(std::shared_ptr& adapter) - : DetectionModelExt(adapter) { - const ov::AnyMap& configuration = adapter->getModelConfig(); - initDefaultParameters(configuration); -} - -void ModelRetinaFace::updateModelInfo() { - DetectionModelExt::updateModelInfo(); - - model->set_rt_info(ModelRetinaFace::ModelType, "model_info", "model_type"); -} - -void ModelRetinaFace::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("RetinaFace model wrapper expects models that have only 1 input"); - } - const ov::Shape& inputShape = model->input().get_partial_shape().get_max_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("Expected 3-channel input"); - } - - ov::preprocess::PrePostProcessor ppp(model); - ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); - - if (useAutoResize) { - ppp.input().tensor().set_spatial_dynamic_shape(); - - ppp.input() - .preprocess() - .convert_element_type(ov::element::f32) - .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); - } - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Reading image input parameters ------------------------------------------- - inputNames.push_back(model->input().get_any_name()); - netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; - netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; - - // --------------------------- Prepare output ----------------------------------------------------- - - const ov::OutputVector& outputs = model->outputs(); - if (outputs.size() != 6 && outputs.size() != 9 && outputs.size() != 12) { - throw std::logic_error("RetinaFace model wrapper expects models that have 6, 9 or 12 outputs"); - } - - const ov::Layout outputLayout{"NCHW"}; - std::vector outputsSizes[OUT_MAX]; - for (const auto& output : model->outputs()) { - auto outTensorName = output.get_any_name(); - outputNames.push_back(outTensorName); - ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout); - - OutputType type = OUT_MAX; - if (outTensorName.find("box") != std::string::npos) { - type = OUT_BOXES; - } else if (outTensorName.find("cls") != std::string::npos) { - type = OUT_SCORES; - } else if (outTensorName.find("landmark") != std::string::npos) { - type = OUT_LANDMARKS; - shouldDetectLandmarks = true; - } else if (outTensorName.find("type") != 
std::string::npos) { - type = OUT_MASKSCORES; - labels.clear(); - labels.push_back("No Mask"); - labels.push_back("Mask"); - shouldDetectMasks = true; - landmarkStd = 0.2f; - } else { - continue; - } - - size_t num = output.get_shape()[ov::layout::height_idx(outputLayout)]; - size_t i = 0; - for (; i < outputsSizes[type].size(); ++i) { - if (num < outputsSizes[type][i]) { - break; - } - } - separateoutputNames[type].insert(separateoutputNames[type].begin() + i, outTensorName); - outputsSizes[type].insert(outputsSizes[type].begin() + i, num); - } - model = ppp.build(); - - for (size_t idx = 0; idx < outputsSizes[OUT_BOXES].size(); ++idx) { - size_t width = outputsSizes[OUT_BOXES][idx]; - size_t height = outputsSizes[OUT_BOXES][idx]; - auto s = anchorCfg[idx].stride; - auto anchorNum = anchorsFpn[s].size(); - - anchors.push_back(std::vector(height * width * anchorNum)); - for (size_t iw = 0; iw < width; ++iw) { - size_t sw = iw * s; - for (size_t ih = 0; ih < height; ++ih) { - size_t sh = ih * s; - for (size_t k = 0; k < anchorNum; ++k) { - Anchor& anc = anchors[idx][(ih * width + iw) * anchorNum + k]; - anc.left = anchorsFpn[s][k].left + sw; - anc.top = anchorsFpn[s][k].top + sh; - anc.right = anchorsFpn[s][k].right + sw; - anc.bottom = anchorsFpn[s][k].bottom + sh; - } - } - } - } -} - -std::vector ratioEnum(const Anchor& anchor, const std::vector& ratios) { - std::vector retVal; - const auto w = anchor.getWidth(); - const auto h = anchor.getHeight(); - const auto xCtr = anchor.getXCenter(); - const auto yCtr = anchor.getYCenter(); - - for (const auto ratio : ratios) { - const auto size = w * h; - const auto sizeRatio = static_cast(size) / ratio; - const auto ws = sqrt(sizeRatio); - const auto hs = ws * ratio; - retVal.push_back({static_cast(xCtr - 0.5f * (ws - 1.0f)), - static_cast(yCtr - 0.5f * (hs - 1.0f)), - static_cast(xCtr + 0.5f * (ws - 1.0f)), - static_cast(yCtr + 0.5f * (hs - 1.0f))}); - } - return retVal; -} - -std::vector scaleEnum(const Anchor& anchor, const std::vector& scales) { - std::vector retVal; - const auto w = anchor.getWidth(); - const auto h = anchor.getHeight(); - const auto xCtr = anchor.getXCenter(); - const auto yCtr = anchor.getYCenter(); - - for (auto scale : scales) { - const auto ws = w * scale; - const auto hs = h * scale; - retVal.push_back({static_cast(xCtr - 0.5f * (ws - 1.0f)), - static_cast(yCtr - 0.5f * (hs - 1.0f)), - static_cast(xCtr + 0.5f * (ws - 1.0f)), - static_cast(yCtr + 0.5f * (hs - 1.0f))}); - } - return retVal; -} - -std::vector generateAnchors(const int baseSize, - const std::vector& ratios, - const std::vector& scales) { - Anchor baseAnchor{0.0f, 0.0f, baseSize - 1.0f, baseSize - 1.0f}; - auto ratioAnchors = ratioEnum(baseAnchor, ratios); - std::vector retVal; - - for (const auto& ra : ratioAnchors) { - auto addon = scaleEnum(ra, scales); - retVal.insert(retVal.end(), addon.begin(), addon.end()); - } - return retVal; -} - -void ModelRetinaFace::generateAnchorsFpn() { - auto cfg = anchorCfg; - std::sort(cfg.begin(), cfg.end(), [](const AnchorCfgLine& x, const AnchorCfgLine& y) { - return x.stride > y.stride; - }); - - for (const auto& cfgLine : cfg) { - anchorsFpn.emplace(cfgLine.stride, generateAnchors(cfgLine.baseSize, cfgLine.ratios, cfgLine.scales)); - } -} - -std::vector thresholding(const ov::Tensor& scoresTensor, const int anchorNum, const float confidence_threshold) { - std::vector indices; - indices.reserve(ModelRetinaFace::INIT_VECTOR_SIZE); - auto shape = scoresTensor.get_shape(); - size_t restAnchors = shape[1] - anchorNum; - 
const float* scoresPtr = scoresTensor.data(); - - for (size_t x = anchorNum; x < shape[1]; ++x) { - for (size_t y = 0; y < shape[2]; ++y) { - for (size_t z = 0; z < shape[3]; ++z) { - auto idx = (x * shape[2] + y) * shape[3] + z; - auto score = scoresPtr[idx]; - if (score >= confidence_threshold) { - indices.push_back((y * shape[3] + z) * restAnchors + (x - anchorNum)); - } - } - } - } - - return indices; -} - -void filterScores(std::vector& scores, - const std::vector& indices, - const ov::Tensor& scoresTensor, - const int anchorNum) { - const auto& shape = scoresTensor.get_shape(); - const float* scoresPtr = scoresTensor.data(); - const auto start = shape[2] * shape[3] * anchorNum; - - for (auto i : indices) { - auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum; - scores.push_back(scoresPtr[start + offset]); - } -} - -void filterBoxes(std::vector& boxes, - const std::vector& indices, - const ov::Tensor& boxesTensor, - int anchorNum, - const std::vector& anchors) { - const auto& shape = boxesTensor.get_shape(); - const float* boxesPtr = boxesTensor.data(); - const auto boxPredLen = shape[1] / anchorNum; - const auto blockWidth = shape[2] * shape[3]; - - for (auto i : indices) { - auto offset = blockWidth * boxPredLen * (i % anchorNum) + (i / anchorNum); - - const auto dx = boxesPtr[offset]; - const auto dy = boxesPtr[offset + blockWidth]; - const auto dw = boxesPtr[offset + blockWidth * 2]; - const auto dh = boxesPtr[offset + blockWidth * 3]; - - const auto predCtrX = dx * anchors[i].getWidth() + anchors[i].getXCenter(); - const auto predCtrY = dy * anchors[i].getHeight() + anchors[i].getYCenter(); - const auto predW = exp(dw) * anchors[i].getWidth(); - const auto predH = exp(dh) * anchors[i].getHeight(); - - boxes.push_back({static_cast(predCtrX - 0.5f * (predW - 1.0f)), - static_cast(predCtrY - 0.5f * (predH - 1.0f)), - static_cast(predCtrX + 0.5f * (predW - 1.0f)), - static_cast(predCtrY + 0.5f * (predH - 1.0f))}); - } -} - -void filterLandmarks(std::vector& landmarks, - const std::vector& indices, - const ov::Tensor& landmarksTensor, - int anchorNum, - const std::vector& anchors, - const float landmarkStd) { - const auto& shape = landmarksTensor.get_shape(); - const float* landmarksPtr = landmarksTensor.data(); - const auto landmarkPredLen = shape[1] / anchorNum; - const auto blockWidth = shape[2] * shape[3]; - - for (auto i : indices) { - for (int j = 0; j < ModelRetinaFace::LANDMARKS_NUM; ++j) { - auto offset = (i % anchorNum) * landmarkPredLen * shape[2] * shape[3] + i / anchorNum; - auto deltaX = landmarksPtr[offset + j * 2 * blockWidth] * landmarkStd; - auto deltaY = landmarksPtr[offset + (j * 2 + 1) * blockWidth] * landmarkStd; - landmarks.push_back({deltaX * anchors[i].getWidth() + anchors[i].getXCenter(), - deltaY * anchors[i].getHeight() + anchors[i].getYCenter()}); - } - } -} - -void filterMasksScores(std::vector& masks, - const std::vector& indices, - const ov::Tensor& maskScoresTensor, - const int anchorNum) { - auto shape = maskScoresTensor.get_shape(); - const float* maskScoresPtr = maskScoresTensor.data(); - auto start = shape[2] * shape[3] * anchorNum * 2; - - for (auto i : indices) { - auto offset = (i % anchorNum) * shape[2] * shape[3] + i / anchorNum; - masks.push_back(maskScoresPtr[start + offset]); - } -} - -std::unique_ptr ModelRetinaFace::postprocess(InferenceResult& infResult) { - std::vector scores; - scores.reserve(INIT_VECTOR_SIZE); - std::vector boxes; - boxes.reserve(INIT_VECTOR_SIZE); - std::vector landmarks; - std::vector masks; - - if 
(shouldDetectLandmarks) { - landmarks.reserve(INIT_VECTOR_SIZE); - } - if (shouldDetectMasks) { - masks.reserve(INIT_VECTOR_SIZE); - } - - // --------------------------- Gather & Filter output from all levels - // ---------------------------------------------------------- - for (size_t idx = 0; idx < anchorCfg.size(); ++idx) { - const auto boxRaw = infResult.outputsData[separateoutputNames[OUT_BOXES][idx]]; - const auto scoresRaw = infResult.outputsData[separateoutputNames[OUT_SCORES][idx]]; - auto s = anchorCfg[idx].stride; - auto anchorNum = anchorsFpn[s].size(); - - auto validIndices = thresholding(scoresRaw, anchorNum, confidence_threshold); - filterScores(scores, validIndices, scoresRaw, anchorNum); - filterBoxes(boxes, validIndices, boxRaw, anchorNum, anchors[idx]); - if (shouldDetectLandmarks) { - const auto landmarksRaw = infResult.outputsData[separateoutputNames[OUT_LANDMARKS][idx]]; - filterLandmarks(landmarks, validIndices, landmarksRaw, anchorNum, anchors[idx], landmarkStd); - } - if (shouldDetectMasks) { - const auto masksRaw = infResult.outputsData[separateoutputNames[OUT_MASKSCORES][idx]]; - filterMasksScores(masks, validIndices, masksRaw, anchorNum); - } - } - // --------------------------- Apply Non-maximum Suppression - // ---------------------------------------------------------- !shouldDetectLandmarks determines nms behavior, if - // true - boundaries are included in areas calculation - const auto keep = nms(boxes, scores, iou_threshold, !shouldDetectLandmarks); - - // --------------------------- Create detection result objects - // -------------------------------------------------------- - RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData); - - const auto imgWidth = infResult.internalModelData->asRef().inputImgWidth; - const auto imgHeight = infResult.internalModelData->asRef().inputImgHeight; - const auto scaleX = static_cast(netInputWidth) / imgWidth; - const auto scaleY = static_cast(netInputHeight) / imgHeight; - - result->objects.reserve(keep.size()); - result->landmarks.reserve(keep.size() * ModelRetinaFace::LANDMARKS_NUM); - for (auto i : keep) { - DetectedObject desc; - desc.confidence = scores[i]; - //--- Scaling coordinates - boxes[i].left /= scaleX; - boxes[i].top /= scaleY; - boxes[i].right /= scaleX; - boxes[i].bottom /= scaleY; - - desc.x = clamp(boxes[i].left, 0.f, static_cast(imgWidth)); - desc.y = clamp(boxes[i].top, 0.f, static_cast(imgHeight)); - desc.width = clamp(boxes[i].getWidth(), 0.f, static_cast(imgWidth)); - desc.height = clamp(boxes[i].getHeight(), 0.f, static_cast(imgHeight)); - //--- Default label 0 - Face. If detecting masks then labels would be 0 - No Mask, 1 - Mask - desc.labelID = shouldDetectMasks ? 
(masks[i] > maskThreshold) : 0; - desc.label = labels[desc.labelID]; - result->objects.push_back(desc); - - //--- Scaling landmarks coordinates - for (size_t l = 0; l < ModelRetinaFace::LANDMARKS_NUM && shouldDetectLandmarks; ++l) { - landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x = - clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].x / scaleX, 0.f, static_cast(imgWidth)); - landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y = - clamp(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l].y / scaleY, 0.f, static_cast(imgHeight)); - result->landmarks.push_back(landmarks[i * ModelRetinaFace::LANDMARKS_NUM + l]); - } - } - - return std::unique_ptr(result); -} diff --git a/model_api/cpp/models/src/detection_model_retinaface_pt.cpp b/model_api/cpp/models/src/detection_model_retinaface_pt.cpp deleted file mode 100644 index e61aa92a..00000000 --- a/model_api/cpp/models/src/detection_model_retinaface_pt.cpp +++ /dev/null @@ -1,296 +0,0 @@ -/* -// Copyright (C) 2021-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "models/detection_model_retinaface_pt.h" - -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "models/internal_model_data.h" -#include "models/results.h" - -std::string ModelRetinaFacePT::ModelType = "retinaface-pytorch"; - -void ModelRetinaFacePT::initDefaultParameters(const ov::AnyMap& configuration) { - resizeMode = RESIZE_FILL; // Ignore resize_type for now - auto labels_string = configuration.find("labels"); // Override default if it is not set - if (labels_string == configuration.end()) { - labels = {"Face"}; - } -} - -ModelRetinaFacePT::ModelRetinaFacePT(std::shared_ptr& model, const ov::AnyMap& configuration) - : DetectionModelExt(model, configuration) { - initDefaultParameters(configuration); -} - -ModelRetinaFacePT::ModelRetinaFacePT(std::shared_ptr& adapter) - : DetectionModelExt(adapter) { - const ov::AnyMap& configuration = adapter->getModelConfig(); - initDefaultParameters(configuration); -} - -void ModelRetinaFacePT::updateModelInfo() { - DetectionModelExt::updateModelInfo(); - - model->set_rt_info(ModelRetinaFacePT::ModelType, "model_info", "model_type"); -} - - -void ModelRetinaFacePT::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("RetinaFacePT model wrapper expects models that have only 1 input"); - } - - const ov::Shape& inputShape = model->input().get_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("Expected 3-channel input"); - } - - ov::preprocess::PrePostProcessor ppp(model); - inputTransform.setPrecision(ppp, model->input().get_any_name()); - 
ppp.input().tensor().set_layout({"NHWC"}); - - if (useAutoResize) { - ppp.input().tensor().set_spatial_dynamic_shape(); - - ppp.input() - .preprocess() - .convert_element_type(ov::element::f32) - .resize(ov::preprocess::ResizeAlgorithm::RESIZE_LINEAR); - } - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Reading image input parameters ------------------------------------------- - inputNames.push_back(model->input().get_any_name()); - netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; - netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; - - // --------------------------- Prepare output ----------------------------------------------------- - if (model->outputs().size() != 3) { - throw std::logic_error("RetinaFace model wrapper expects models that have 3 outputs"); - } - - landmarksNum = 0; - - outputNames.resize(2); - std::vector outputsSizes[OUT_MAX]; - const ov::Layout chw("CHW"); - const ov::Layout nchw("NCHW"); - for (auto& output : model->outputs()) { - auto outTensorName = output.get_any_name(); - outputNames.push_back(outTensorName); - ppp.output(outTensorName) - .tensor() - .set_element_type(ov::element::f32) - .set_layout(output.get_shape().size() == 4 ? nchw : chw); - - if (outTensorName.find("bbox") != std::string::npos) { - outputNames[OUT_BOXES] = outTensorName; - } else if (outTensorName.find("cls") != std::string::npos) { - outputNames[OUT_SCORES] = outTensorName; - } else if (outTensorName.find("landmark") != std::string::npos) { - // Landmarks might be optional, if it is present, resize names array to fit landmarks output name to the - // last item of array Considering that other outputs names are already filled in or will be filled later - outputNames.resize(std::max(outputNames.size(), (size_t)OUT_LANDMARKS + 1)); - outputNames[OUT_LANDMARKS] = outTensorName; - landmarksNum = - output.get_shape()[ov::layout::width_idx(chw)] / 2; // Each landmark consist of 2 variables (x and y) - } else { - continue; - } - } - - if (outputNames[OUT_BOXES] == "" || outputNames[OUT_SCORES] == "") { - throw std::logic_error("Bbox or cls layers are not found"); - } - - model = ppp.build(); - priors = generatePriorData(); -} - -std::vector ModelRetinaFacePT::filterByScore(const ov::Tensor& scoresTensor, const float confidence_threshold) { - std::vector indicies; - const auto& shape = scoresTensor.get_shape(); - const float* scoresPtr = scoresTensor.data(); - - for (size_t x = 0; x < shape[1]; ++x) { - const auto idx = (x * shape[2] + 1); - const auto score = scoresPtr[idx]; - if (score >= confidence_threshold) { - indicies.push_back(x); - } - } - - return indicies; -} - -std::vector ModelRetinaFacePT::getFilteredScores(const ov::Tensor& scoresTensor, - const std::vector& indicies) { - const auto& shape = scoresTensor.get_shape(); - const float* scoresPtr = scoresTensor.data(); - - std::vector scores; - scores.reserve(indicies.size()); - - for (auto i : indicies) { - scores.push_back(scoresPtr[i * shape[2] + 1]); - } - return scores; -} - -std::vector ModelRetinaFacePT::getFilteredLandmarks(const ov::Tensor& landmarksTensor, - const std::vector& indicies, - int imgWidth, - int imgHeight) { - const auto& shape = landmarksTensor.get_shape(); - const float* landmarksPtr = landmarksTensor.data(); - - std::vector landmarks(landmarksNum * indicies.size()); - - for (size_t i = 0; i < indicies.size(); i++) { - const size_t idx = indicies[i]; - const auto& prior = priors[idx]; - for (size_t j = 0; j < landmarksNum; j++) { - landmarks[i * 
landmarksNum + j].x = - clamp(prior.cX + landmarksPtr[idx * shape[2] + j * 2] * variance[0] * prior.width, 0.f, 1.f) * imgWidth; - landmarks[i * landmarksNum + j].y = - clamp(prior.cY + landmarksPtr[idx * shape[2] + j * 2 + 1] * variance[0] * prior.height, 0.f, 1.f) * - imgHeight; - } - } - return landmarks; -} - -std::vector ModelRetinaFacePT::generatePriorData() { - const float globalMinSizes[][2] = {{16, 32}, {64, 128}, {256, 512}}; - const float steps[] = {8., 16., 32.}; - std::vector anchors; - for (size_t stepNum = 0; stepNum < arraySize(steps); stepNum++) { - const int featureW = static_cast(std::round(netInputWidth / steps[stepNum])); - const int featureH = static_cast(std::round(netInputHeight / steps[stepNum])); - - const auto& minSizes = globalMinSizes[stepNum]; - for (int i = 0; i < featureH; i++) { - for (int j = 0; j < featureW; j++) { - for (auto minSize : minSizes) { - const float sKX = minSize / netInputWidth; - const float sKY = minSize / netInputHeight; - const float denseCY = (i + 0.5f) * steps[stepNum] / netInputHeight; - const float denseCX = (j + 0.5f) * steps[stepNum] / netInputWidth; - anchors.push_back(ModelRetinaFacePT::Box{denseCX, denseCY, sKX, sKY}); - } - } - } - } - return anchors; -} - -std::vector ModelRetinaFacePT::getFilteredProposals(const ov::Tensor& boxesTensor, - const std::vector& indicies, - int imgWidth, - int imgHeight) { - std::vector rects; - rects.reserve(indicies.size()); - - const auto& shape = boxesTensor.get_shape(); - const float* boxesPtr = boxesTensor.data(); - - if (shape[1] != priors.size()) { - throw std::logic_error("rawBoxes size is not equal to priors size"); - } - - for (auto i : indicies) { - const auto pRawBox = reinterpret_cast(boxesPtr + i * shape[2]); - const auto& prior = priors[i]; - const float cX = priors[i].cX + pRawBox->cX * variance[0] * prior.width; - const float cY = priors[i].cY + pRawBox->cY * variance[0] * prior.height; - const float width = prior.width * exp(pRawBox->width * variance[1]); - const float height = prior.height * exp(pRawBox->height * variance[1]); - rects.push_back(Anchor{clamp(cX - width / 2, 0.f, 1.f) * imgWidth, - clamp(cY - height / 2, 0.f, 1.f) * imgHeight, - clamp(cX + width / 2, 0.f, 1.f) * imgWidth, - clamp(cY + height / 2, 0.f, 1.f) * imgHeight}); - } - - return rects; -} - -std::unique_ptr ModelRetinaFacePT::postprocess(InferenceResult& infResult) { - // (raw_output, scale_x, scale_y, face_prob_threshold, image_size): - const auto boxesTensor = infResult.outputsData[outputNames[OUT_BOXES]]; - const auto scoresTensor = infResult.outputsData[outputNames[OUT_SCORES]]; - - const auto& validIndicies = filterByScore(scoresTensor, confidence_threshold); - const auto& scores = getFilteredScores(scoresTensor, validIndicies); - - const auto& internalData = infResult.internalModelData->asRef(); - const auto& landmarks = landmarksNum ? 
getFilteredLandmarks(infResult.outputsData[outputNames[OUT_LANDMARKS]], - validIndicies, - internalData.inputImgWidth, - internalData.inputImgHeight) - : std::vector(); - - const auto& proposals = - getFilteredProposals(boxesTensor, validIndicies, internalData.inputImgWidth, internalData.inputImgHeight); - - const auto& keptIndicies = nms(proposals, scores, iou_threshold, !landmarksNum); - - // --------------------------- Create detection result objects - // -------------------------------------------------------- - RetinaFaceDetectionResult* result = new RetinaFaceDetectionResult(infResult.frameId, infResult.metaData); - - result->objects.reserve(keptIndicies.size()); - result->landmarks.reserve(keptIndicies.size() * landmarksNum); - for (auto i : keptIndicies) { - DetectedObject desc; - desc.confidence = scores[i]; - - //--- Scaling coordinates - desc.x = proposals[i].left; - desc.y = proposals[i].top; - desc.width = proposals[i].getWidth(); - desc.height = proposals[i].getHeight(); - - desc.labelID = 0; - desc.label = labels[desc.labelID]; - result->objects.push_back(desc); - - //--- Filtering landmarks coordinates - for (uint32_t l = 0; l < landmarksNum; ++l) { - result->landmarks.emplace_back(landmarks[i * landmarksNum + l].x, landmarks[i * landmarksNum + l].y); - } - } - - return std::unique_ptr(result); -} diff --git a/model_api/cpp/models/src/hpe_model_associative_embedding.cpp b/model_api/cpp/models/src/hpe_model_associative_embedding.cpp deleted file mode 100644 index 4f3b8c56..00000000 --- a/model_api/cpp/models/src/hpe_model_associative_embedding.cpp +++ /dev/null @@ -1,263 +0,0 @@ -/* -// Copyright (C) 2021-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "models/hpe_model_associative_embedding.h" - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "models/associative_embedding_decoder.h" -#include "models/input_data.h" -#include "models/internal_model_data.h" -#include "models/results.h" - -const cv::Vec3f HpeAssociativeEmbedding::meanPixel = cv::Vec3f::all(128); -const float HpeAssociativeEmbedding::detectionThreshold = 0.1f; -const float HpeAssociativeEmbedding::tagThreshold = 1.0f; - -HpeAssociativeEmbedding::HpeAssociativeEmbedding(const std::string& modelFile, - double aspectRatio, - int targetSize, - float confidence_threshold, - const std::string& layout, - float delta, - const std::string& resize_type) - : ImageModel(modelFile, resize_type, false, layout), - aspectRatio(aspectRatio), - targetSize(targetSize), - confidence_threshold(confidence_threshold), - delta(delta) { - interpolationMode = cv::INTER_CUBIC; - } - -void HpeAssociativeEmbedding::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input Tensors ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("HPE AE model wrapper supports topologies with only 1 input."); - } - inputNames.push_back(model->input().get_any_name()); - - const ov::Shape& inputShape = model->input().get_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || - inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("3-channel 4-dimensional model's input is expected"); - } - - ov::preprocess::PrePostProcessor ppp(model); - ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Prepare output Tensors ----------------------------------------------------- - const ov::OutputVector& outputs = model->outputs(); - if (outputs.size() != 2 && outputs.size() != 3) { - throw std::logic_error("HPE AE model model wrapper supports topologies only with 2 or 3 outputs"); - } - - for (const auto& output : model->outputs()) { - const auto& outTensorName = output.get_any_name(); - ppp.output(outTensorName).tensor().set_element_type(ov::element::f32); - - for (const auto& name : output.get_names()) { - outputNames.push_back(name); - } - - const ov::Shape& outputShape = output.get_shape(); - if (outputShape.size() != 4 && outputShape.size() != 5) { - throw std::logic_error("output tensors are expected to be 4-dimensional or 5-dimensional"); - } - if (outputShape[ov::layout::batch_idx("NC...")] != 1 || outputShape[ov::layout::channels_idx("NC...")] != 17) { - throw std::logic_error("output tensors are expected to have 1 batch size and 17 channels"); - } - } - model = ppp.build(); - - embeddingsTensorName = findTensorByName("embeddings", outputNames); - heatmapsTensorName = findTensorByName("heatmaps", outputNames); - try { - nmsHeatmapsTensorName = findTensorByName("nms_heatmaps", outputNames); - } catch (const std::runtime_error&) { nmsHeatmapsTensorName = heatmapsTensorName; } - - changeInputSize(model); -} - -void HpeAssociativeEmbedding::changeInputSize(std::shared_ptr& model) { - ov::Shape inputShape = model->input().get_shape(); - const ov::Layout& layout = 
ov::layout::get_layout(model->input()); - const auto batchId = ov::layout::batch_idx(layout); - const auto heightId = ov::layout::height_idx(layout); - const auto widthId = ov::layout::width_idx(layout); - - if (!targetSize) { - targetSize = static_cast(std::min(inputShape[heightId], inputShape[widthId])); - } - int inputHeight = aspectRatio >= 1.0 ? targetSize : static_cast(std::round(targetSize / aspectRatio)); - int inputWidth = aspectRatio >= 1.0 ? static_cast(std::round(targetSize * aspectRatio)) : targetSize; - int height = static_cast((inputHeight + stride - 1) / stride) * stride; - int width = static_cast((inputWidth + stride - 1) / stride) * stride; - inputShape[batchId] = 1; - inputShape[heightId] = height; - inputShape[widthId] = width; - inputLayerSize = cv::Size(width, height); - - model->reshape(inputShape); -} - -std::shared_ptr HpeAssociativeEmbedding::preprocess(const InputData& inputData, - InferenceInput& input) { - auto& image = inputData.asRef().inputImage; - cv::Rect roi; - auto paddedImage = resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi); - if (inputLayerSize.height - stride >= roi.height || inputLayerSize.width - stride >= roi.width) { - slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; - } - input.emplace(inputNames[0], wrapMat2Tensor(paddedImage)); - - return std::make_shared(paddedImage.cols, - paddedImage.rows, - image.size().width / static_cast(roi.width), - image.size().height / static_cast(roi.height)); -} - -std::unique_ptr HpeAssociativeEmbedding::postprocess(InferenceResult& infResult) { - HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData); - - const auto& aembds = infResult.outputsData[embeddingsTensorName]; - const ov::Shape& aembdsShape = aembds.get_shape(); - float* const aembdsMapped = aembds.data(); - std::vector aembdsMaps = split(aembdsMapped, aembdsShape); - - const auto& heats = infResult.outputsData[heatmapsTensorName]; - const ov::Shape& heatMapsShape = heats.get_shape(); - float* const heatMapsMapped = heats.data(); - std::vector heatMaps = split(heatMapsMapped, heatMapsShape); - - std::vector nmsHeatMaps = heatMaps; - if (nmsHeatmapsTensorName != heatmapsTensorName) { - const auto& nmsHeats = infResult.outputsData[nmsHeatmapsTensorName]; - const ov::Shape& nmsHeatMapsShape = nmsHeats.get_shape(); - float* const nmsHeatMapsMapped = nmsHeats.data(); - nmsHeatMaps = split(nmsHeatMapsMapped, nmsHeatMapsShape); - } - std::vector poses = extractPoses(heatMaps, aembdsMaps, nmsHeatMaps); - - // Rescale poses to the original image - const auto& scale = infResult.internalModelData->asRef(); - const float outputScale = inputLayerSize.width / static_cast(heatMapsShape[3]); - float shiftX = 0.0, shiftY = 0.0; - float scaleX = 1.0, scaleY = 1.0; - - if (resizeMode == RESIZE_KEEP_ASPECT_LETTERBOX) { - scaleX = scaleY = std::min(scale.scaleX, scale.scaleY); - if (aspectRatio >= 1.0) - shiftX = static_cast((targetSize * scaleX * aspectRatio - scale.inputImgWidth * scaleX) / 2); - else - shiftY = static_cast((targetSize * scaleY / aspectRatio - scale.inputImgHeight * scaleY) / 2); - scaleX = scaleY *= outputScale; - } else { - scaleX = scale.scaleX * outputScale; - scaleY = scale.scaleY * outputScale; - } - - for (auto& pose : poses) { - for (auto& keypoint : pose.keypoints) { - if (keypoint != cv::Point2f(-1, -1)) { - keypoint.x = keypoint.x * scaleX + shiftX; - keypoint.y = keypoint.y * scaleY + shiftY; - } - } - 
result->poses.push_back(pose); - } - - return std::unique_ptr(result); -} - -std::string HpeAssociativeEmbedding::findTensorByName(const std::string& tensorName, - const std::vector& outputNames) { - std::vector suitableLayers; - for (auto& outputName : outputNames) { - if (outputName.rfind(tensorName, 0) == 0) { - suitableLayers.push_back(outputName); - } - } - if (suitableLayers.empty()) { - throw std::runtime_error("Suitable tensor for " + tensorName + " output is not found"); - } else if (suitableLayers.size() > 1) { - throw std::runtime_error("More than 1 tensor matched to " + tensorName + " output"); - } - return suitableLayers[0]; -} - -std::vector HpeAssociativeEmbedding::split(float* data, const ov::Shape& shape) { - std::vector flattenData(shape[1]); - for (size_t i = 0; i < flattenData.size(); i++) { - flattenData[i] = cv::Mat(shape[2], shape[3], CV_32FC1, data + i * shape[2] * shape[3]); - } - return flattenData; -} - -std::vector HpeAssociativeEmbedding::extractPoses(std::vector& heatMaps, - const std::vector& aembdsMaps, - const std::vector& nmsHeatMaps) const { - std::vector> allPeaks(numJoints); - for (int i = 0; i < numJoints; i++) { - findPeaks(nmsHeatMaps, aembdsMaps, allPeaks, i, maxNumPeople, detectionThreshold); - } - std::vector allPoses = matchByTag(allPeaks, maxNumPeople, numJoints, tagThreshold); - // swap for all poses - for (auto& pose : allPoses) { - for (size_t j = 0; j < numJoints; j++) { - Peak& peak = pose.getPeak(j); - std::swap(peak.keypoint.x, peak.keypoint.y); - } - } - std::vector poses; - for (size_t i = 0; i < allPoses.size(); i++) { - Pose& pose = allPoses[i]; - // Filtering poses with low mean scores - if (pose.getMeanScore() <= confidence_threshold) { - continue; - } - for (size_t j = 0; j < heatMaps.size(); j++) { - heatMaps[j] = cv::abs(heatMaps[j]); - } - adjustAndRefine(allPoses, heatMaps, aembdsMaps, i, delta); - std::vector keypoints; - for (size_t j = 0; j < numJoints; j++) { - Peak& peak = pose.getPeak(j); - keypoints.push_back(peak.keypoint); - } - poses.push_back({keypoints, pose.getMeanScore()}); - } - return poses; -} diff --git a/model_api/cpp/models/src/hpe_model_openpose.cpp b/model_api/cpp/models/src/hpe_model_openpose.cpp deleted file mode 100644 index 74a9a186..00000000 --- a/model_api/cpp/models/src/hpe_model_openpose.cpp +++ /dev/null @@ -1,256 +0,0 @@ -/* -// Copyright (C) 2020-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "models/hpe_model_openpose.h" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "models/input_data.h" -#include "models/internal_model_data.h" -#include "models/openpose_decoder.h" -#include "models/results.h" - -const cv::Vec3f HPEOpenPose::meanPixel = cv::Vec3f::all(128); -const float HPEOpenPose::minPeaksDistance = 3.0f; -const float HPEOpenPose::midPointsScoreThreshold = 0.05f; -const float HPEOpenPose::foundMidPointsRatioThreshold = 0.8f; -const float HPEOpenPose::minSubsetScore = 0.2f; - -HPEOpenPose::HPEOpenPose(const std::string& modelFile, - double aspectRatio, - int targetSize, - float confidence_threshold, - const std::string& layout) - : ImageModel(modelFile, "fit_to_window", false, layout), - aspectRatio(aspectRatio), - targetSize(targetSize), - confidence_threshold(confidence_threshold) { - interpolationMode = cv::INTER_CUBIC; - } - -void HPEOpenPose::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("HPE OpenPose model wrapper supports topologies with only 1 input"); - } - inputNames.push_back(model->input().get_any_name()); - const ov::Shape& inputShape = model->input().get_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || - inputShape[ov::layout::channels_idx(inputLayout)] != 3) - throw std::logic_error("3-channel 4-dimensional model's input is expected"); - - ov::preprocess::PrePostProcessor ppp(model); - ppp.input().tensor().set_element_type(ov::element::u8).set_layout({"NHWC"}); - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Prepare output ----------------------------------------------------- - const ov::OutputVector& outputs = model->outputs(); - if (outputs.size() != 2) { - throw std::runtime_error("HPE OpenPose supports topologies with only 2 outputs"); - } - - const ov::Layout outputLayout("NCHW"); - for (const auto& output : model->outputs()) { - const auto& outTensorName = output.get_any_name(); - ppp.output(outTensorName).tensor().set_element_type(ov::element::f32).set_layout(outputLayout); - outputNames.push_back(outTensorName); - } - model = ppp.build(); - - const size_t batchId = ov::layout::batch_idx(outputLayout); - const size_t channelsId = ov::layout::channels_idx(outputLayout); - const size_t widthId = ov::layout::width_idx(outputLayout); - const size_t heightId = ov::layout::height_idx(outputLayout); - - ov::Shape heatmapsOutputShape = model->outputs().front().get_shape(); - ov::Shape pafsOutputShape = model->outputs().back().get_shape(); - if (heatmapsOutputShape[channelsId] > pafsOutputShape[channelsId]) { - std::swap(heatmapsOutputShape, pafsOutputShape); - std::swap(outputNames[0], outputNames[1]); - } - - if (heatmapsOutputShape.size() != 4 || heatmapsOutputShape[batchId] != 1 || - heatmapsOutputShape[ov::layout::channels_idx(outputLayout)] != keypointsNumber + 1) { - throw std::logic_error("1x" + std::to_string(keypointsNumber + 1) + - "xHFMxWFM dimension of model's heatmap is expected"); - } - if (pafsOutputShape.size() != 4 || pafsOutputShape[batchId] != 1 || - pafsOutputShape[channelsId] != 2 * (keypointsNumber + 1)) { - throw 
std::logic_error("1x" + std::to_string(2 * (keypointsNumber + 1)) + - "xHFMxWFM dimension of model's output is expected"); - } - if (pafsOutputShape[heightId] != heatmapsOutputShape[heightId] || - pafsOutputShape[widthId] != heatmapsOutputShape[widthId]) { - throw std::logic_error("output and heatmap are expected to have matching last two dimensions"); - } - - changeInputSize(model); -} - -void HPEOpenPose::changeInputSize(std::shared_ptr& model) { - ov::Shape inputShape = model->input().get_shape(); - const ov::Layout& layout = ov::layout::get_layout(model->inputs().front()); - const auto batchId = ov::layout::batch_idx(layout); - const auto heightId = ov::layout::height_idx(layout); - const auto widthId = ov::layout::width_idx(layout); - - if (!targetSize) { - targetSize = inputShape[heightId]; - } - int height = static_cast((targetSize + stride - 1) / stride) * stride; - int inputWidth = static_cast(std::round(targetSize * aspectRatio)); - int width = static_cast((inputWidth + stride - 1) / stride) * stride; - inputShape[batchId] = 1; - inputShape[heightId] = height; - inputShape[widthId] = width; - inputLayerSize = cv::Size(width, height); - model->reshape(inputShape); -} - -std::shared_ptr HPEOpenPose::preprocess(const InputData& inputData, InferenceInput& input) { - auto& image = inputData.asRef().inputImage; - cv::Rect roi; - auto paddedImage = - resizeImageExt(image, inputLayerSize.width, inputLayerSize.height, resizeMode, interpolationMode, &roi); - if (inputLayerSize.width < roi.width) - throw std::runtime_error("The image aspect ratio doesn't fit current model shape"); - - if (inputLayerSize.width - stride >= roi.width) { - slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; - } - - input.emplace(inputNames[0], wrapMat2Tensor(paddedImage)); - - return std::make_shared(paddedImage.cols, - paddedImage.rows, - image.cols / static_cast(roi.width), - image.rows / static_cast(roi.height)); -} - -std::unique_ptr HPEOpenPose::postprocess(InferenceResult& infResult) { - HumanPoseResult* result = new HumanPoseResult(infResult.frameId, infResult.metaData); - - const auto& heatMapsMapped = infResult.outputsData[outputNames[0]]; - const auto& outputMapped = infResult.outputsData[outputNames[1]]; - - const ov::Shape& outputShape = outputMapped.get_shape(); - const ov::Shape& heatMapShape = heatMapsMapped.get_shape(); - - float* const predictions = outputMapped.data(); - float* const heats = heatMapsMapped.data(); - - std::vector heatMaps(keypointsNumber); - for (size_t i = 0; i < heatMaps.size(); i++) { - heatMaps[i] = - cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, heats + i * heatMapShape[2] * heatMapShape[3]); - } - resizeFeatureMaps(heatMaps); - - std::vector pafs(outputShape[1]); - for (size_t i = 0; i < pafs.size(); i++) { - pafs[i] = - cv::Mat(heatMapShape[2], heatMapShape[3], CV_32FC1, predictions + i * heatMapShape[2] * heatMapShape[3]); - } - resizeFeatureMaps(pafs); - - std::vector poses = extractPoses(heatMaps, pafs); - - const auto& scale = infResult.internalModelData->asRef(); - float scaleX = stride / upsampleRatio * scale.scaleX; - float scaleY = stride / upsampleRatio * scale.scaleY; - for (auto& pose : poses) { - for (auto& keypoint : pose.keypoints) { - if (keypoint != cv::Point2f(-1, -1)) { - keypoint.x *= scaleX; - keypoint.y *= scaleY; - } - } - } - for (size_t i = 0; i < poses.size(); ++i) { - result->poses.push_back(poses[i]); - } - - return std::unique_ptr(result); -} - -void HPEOpenPose::resizeFeatureMaps(std::vector& 
featureMaps) const { - for (auto& featureMap : featureMaps) { - cv::resize(featureMap, featureMap, cv::Size(), upsampleRatio, upsampleRatio, cv::INTER_CUBIC); - } -} - -class FindPeaksBody : public cv::ParallelLoopBody { -public: - FindPeaksBody(const std::vector& heatMaps, - float minPeaksDistance, - std::vector>& peaksFromHeatMap, - float confidence_threshold) - : heatMaps(heatMaps), - minPeaksDistance(minPeaksDistance), - peaksFromHeatMap(peaksFromHeatMap), - confidence_threshold(confidence_threshold) {} - - void operator()(const cv::Range& range) const override { - for (int i = range.start; i < range.end; i++) { - findPeaks(heatMaps, minPeaksDistance, peaksFromHeatMap, i, confidence_threshold); - } - } - -private: - const std::vector& heatMaps; - float minPeaksDistance; - std::vector>& peaksFromHeatMap; - float confidence_threshold; -}; - -std::vector HPEOpenPose::extractPoses(const std::vector& heatMaps, - const std::vector& pafs) const { - std::vector> peaksFromHeatMap(heatMaps.size()); - FindPeaksBody findPeaksBody(heatMaps, minPeaksDistance, peaksFromHeatMap, confidence_threshold); - cv::parallel_for_(cv::Range(0, static_cast(heatMaps.size())), findPeaksBody); - int peaksBefore = 0; - for (size_t heatmapId = 1; heatmapId < heatMaps.size(); heatmapId++) { - peaksBefore += static_cast(peaksFromHeatMap[heatmapId - 1].size()); - for (auto& peak : peaksFromHeatMap[heatmapId]) { - peak.id += peaksBefore; - } - } - std::vector poses = groupPeaksToPoses(peaksFromHeatMap, - pafs, - keypointsNumber, - midPointsScoreThreshold, - foundMidPointsRatioThreshold, - minJointsNumber, - minSubsetScore); - return poses; -} diff --git a/model_api/cpp/models/src/jpeg_restoration_model.cpp b/model_api/cpp/models/src/jpeg_restoration_model.cpp deleted file mode 100644 index 4dc55a39..00000000 --- a/model_api/cpp/models/src/jpeg_restoration_model.cpp +++ /dev/null @@ -1,167 +0,0 @@ -/* -// Copyright (C) 2021-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include - -#include "models/image_model.h" -#include "models/input_data.h" -#include "models/internal_model_data.h" -#include "models/jpeg_restoration_model.h" -#include "models/results.h" - -JPEGRestorationModel::JPEGRestorationModel(const std::string& modelFile, - const cv::Size& inputImgSize, - bool _jpegCompression, - const std::string& layout) - : ImageModel(modelFile, "standard", false, layout) { - netInputHeight = inputImgSize.height; - netInputWidth = inputImgSize.width; - jpegCompression = _jpegCompression; -} - -void JPEGRestorationModel::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output ------------------------------------------------- - // --------------------------- Prepare input ------------------------------------------------------ - if (model->inputs().size() != 1) { - throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 input"); - } - inputNames.push_back(model->input().get_any_name()); - - const ov::Shape& inputShape = model->input().get_shape(); - const ov::Layout& inputLayout = getInputLayout(model->input()); - - if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || - inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("3-channel 4-dimensional model's input is expected"); - } - - ov::preprocess::PrePostProcessor ppp(model); - ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC"); - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Prepare output ----------------------------------------------------- - const ov::OutputVector& outputs = model->outputs(); - if (outputs.size() != 1) { - throw std::logic_error("The JPEG Restoration model wrapper supports topologies with only 1 output"); - } - const ov::Shape& outputShape = model->output().get_shape(); - const ov::Layout outputLayout{"NCHW"}; - if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 || - outputShape[ov::layout::channels_idx(outputLayout)] != 3) { - throw std::logic_error("3-channel 4-dimensional model's output is expected"); - } - - outputNames.push_back(model->output().get_any_name()); - ppp.output().tensor().set_element_type(ov::element::f32); - model = ppp.build(); - - changeInputSize(model); -} - -void JPEGRestorationModel::changeInputSize(std::shared_ptr& model) { - ov::Shape inputShape = model->input().get_shape(); - const ov::Layout& layout = ov::layout::get_layout(model->input()); - - const auto batchId = ov::layout::batch_idx(layout); - const auto heightId = ov::layout::height_idx(layout); - const auto widthId = ov::layout::width_idx(layout); - - if (inputShape[heightId] % stride || inputShape[widthId] % stride) { - throw std::logic_error("The shape of the model input must be divisible by stride"); - } - - netInputHeight = static_cast((netInputHeight + stride - 1) / stride) * stride; - netInputWidth = static_cast((netInputWidth + stride - 1) / stride) * stride; - - inputShape[batchId] = 1; - inputShape[heightId] = netInputHeight; - inputShape[widthId] = netInputWidth; - - model->reshape(inputShape); -} - -std::shared_ptr JPEGRestorationModel::preprocess(const InputData& inputData, - InferenceInput& input) { - cv::Mat image = inputData.asRef().inputImage; - const size_t h = image.rows; - const size_t w = image.cols; - cv::Mat resizedImage; - if 
(jpegCompression) { - std::vector encimg; - std::vector params{cv::IMWRITE_JPEG_QUALITY, 40}; - cv::imencode(".jpg", image, encimg, params); - image = cv::imdecode(cv::Mat(encimg), 3); - } - - if (netInputHeight - stride < h && h <= netInputHeight && netInputWidth - stride < w && w <= netInputWidth) { - int bottom = netInputHeight - h; - int right = netInputWidth - w; - cv::copyMakeBorder(image, resizedImage, 0, bottom, 0, right, cv::BORDER_CONSTANT, 0); - } else { - slog::warn << "\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; - cv::resize(image, resizedImage, cv::Size(netInputWidth, netInputHeight)); - } - input.emplace(inputNames[0], wrapMat2Tensor(resizedImage)); - - return std::make_shared(image.cols, image.rows); -} - -std::unique_ptr JPEGRestorationModel::postprocess(InferenceResult& infResult) { - ImageResult* result = new ImageResult; - *static_cast(result) = static_cast(infResult); - - const auto& inputImgSize = infResult.internalModelData->asRef(); - const auto outputData = infResult.getFirstOutputTensor().data(); - - std::vector imgPlanes; - const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape(); - const size_t outHeight = static_cast(outputShape[2]); - const size_t outWidth = static_cast(outputShape[3]); - const size_t numOfPixels = outWidth * outHeight; - imgPlanes = std::vector{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))}; - cv::Mat resultImg; - cv::merge(imgPlanes, resultImg); - - if (netInputHeight - stride < static_cast(inputImgSize.inputImgHeight) && - static_cast(inputImgSize.inputImgHeight) <= netInputHeight && - netInputWidth - stride < static_cast(inputImgSize.inputImgWidth) && - static_cast(inputImgSize.inputImgWidth) <= netInputWidth) { - result->resultImage = resultImg(cv::Rect(0, 0, inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); - } else { - cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); - } - - result->resultImage.convertTo(result->resultImage, CV_8UC3, 255); - - return std::unique_ptr(result); -} diff --git a/model_api/cpp/models/src/openpose_decoder.cpp b/model_api/cpp/models/src/openpose_decoder.cpp deleted file mode 100644 index 6d519fef..00000000 --- a/model_api/cpp/models/src/openpose_decoder.cpp +++ /dev/null @@ -1,345 +0,0 @@ -/* -// Copyright (C) 2020-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "models/openpose_decoder.h" - -#include -#include -#include -#include -#include - -#include - -#include "models/results.h" - -Peak::Peak(const int id, const cv::Point2f& pos, const float score) : id(id), pos(pos), score(score) {} - -HumanPoseByPeaksIndices::HumanPoseByPeaksIndices(const int keypointsNumber) - : peaksIndices(std::vector(keypointsNumber, -1)), - nJoints(0), - score(0.0f) {} - -TwoJointsConnection::TwoJointsConnection(const int firstJointIdx, const int secondJointIdx, const float score) - : firstJointIdx(firstJointIdx), - secondJointIdx(secondJointIdx), - score(score) {} - -void findPeaks(const std::vector& heatMaps, - const float minPeaksDistance, - std::vector>& allPeaks, - int heatMapId, - float confidence_threshold) { - std::vector peaks; - const cv::Mat& heatMap = heatMaps[heatMapId]; - const float* heatMapData = heatMap.ptr(); - size_t heatMapStep = heatMap.step1(); - for (int y = -1; y < heatMap.rows + 1; y++) { - for (int x = -1; x < heatMap.cols + 1; x++) { - float val = 0; - if (x >= 0 && y >= 0 && x < heatMap.cols && y < heatMap.rows) { - val = heatMapData[y * heatMapStep + x]; - val = val >= confidence_threshold ? val : 0; - } - - float left_val = 0; - if (y >= 0 && x < (heatMap.cols - 1) && y < heatMap.rows) { - left_val = heatMapData[y * heatMapStep + x + 1]; - left_val = left_val >= confidence_threshold ? left_val : 0; - } - - float right_val = 0; - if (x > 0 && y >= 0 && y < heatMap.rows) { - right_val = heatMapData[y * heatMapStep + x - 1]; - right_val = right_val >= confidence_threshold ? right_val : 0; - } - - float top_val = 0; - if (x >= 0 && x < heatMap.cols && y < (heatMap.rows - 1)) { - top_val = heatMapData[(y + 1) * heatMapStep + x]; - top_val = top_val >= confidence_threshold ? top_val : 0; - } - - float bottom_val = 0; - if (x >= 0 && y > 0 && x < heatMap.cols) { - bottom_val = heatMapData[(y - 1) * heatMapStep + x]; - bottom_val = bottom_val >= confidence_threshold ? 
bottom_val : 0; - } - - if ((val > left_val) && (val > right_val) && (val > top_val) && (val > bottom_val)) { - peaks.push_back(cv::Point(x, y)); - } - } - } - std::sort(peaks.begin(), peaks.end(), [](const cv::Point& a, const cv::Point& b) { - return a.x < b.x; - }); - std::vector isActualPeak(peaks.size(), true); - int peakCounter = 0; - std::vector& peaksWithScoreAndID = allPeaks[heatMapId]; - for (size_t i = 0; i < peaks.size(); i++) { - if (isActualPeak[i]) { - for (size_t j = i + 1; j < peaks.size(); j++) { - if (sqrt((peaks[i].x - peaks[j].x) * (peaks[i].x - peaks[j].x) + - (peaks[i].y - peaks[j].y) * (peaks[i].y - peaks[j].y)) < minPeaksDistance) { - isActualPeak[j] = false; - } - } - peaksWithScoreAndID.push_back(Peak(peakCounter++, peaks[i], heatMap.at(peaks[i]))); - } - } -} - -std::vector groupPeaksToPoses(const std::vector>& allPeaks, - const std::vector& pafs, - const size_t keypointsNumber, - const float midPointsScoreThreshold, - const float foundMidPointsRatioThreshold, - const int minJointsNumber, - const float minSubsetScore) { - static const std::pair limbIdsHeatmap[] = {{2, 3}, - {2, 6}, - {3, 4}, - {4, 5}, - {6, 7}, - {7, 8}, - {2, 9}, - {9, 10}, - {10, 11}, - {2, 12}, - {12, 13}, - {13, 14}, - {2, 1}, - {1, 15}, - {15, 17}, - {1, 16}, - {16, 18}, - {3, 17}, - {6, 18}}; - static const std::pair limbIdsPaf[] = {{31, 32}, - {39, 40}, - {33, 34}, - {35, 36}, - {41, 42}, - {43, 44}, - {19, 20}, - {21, 22}, - {23, 24}, - {25, 26}, - {27, 28}, - {29, 30}, - {47, 48}, - {49, 50}, - {53, 54}, - {51, 52}, - {55, 56}, - {37, 38}, - {45, 46}}; - - std::vector candidates; - for (const auto& peaks : allPeaks) { - candidates.insert(candidates.end(), peaks.begin(), peaks.end()); - } - std::vector subset(0, HumanPoseByPeaksIndices(keypointsNumber)); - for (size_t k = 0; k < arraySize(limbIdsPaf); k++) { - std::vector connections; - const int mapIdxOffset = keypointsNumber + 1; - std::pair scoreMid = {pafs[limbIdsPaf[k].first - mapIdxOffset], - pafs[limbIdsPaf[k].second - mapIdxOffset]}; - const int idxJointA = limbIdsHeatmap[k].first - 1; - const int idxJointB = limbIdsHeatmap[k].second - 1; - const std::vector& candA = allPeaks[idxJointA]; - const std::vector& candB = allPeaks[idxJointB]; - const size_t nJointsA = candA.size(); - const size_t nJointsB = candB.size(); - if (nJointsA == 0 && nJointsB == 0) { - continue; - } else if (nJointsA == 0) { - for (size_t i = 0; i < nJointsB; i++) { - int num = 0; - for (size_t j = 0; j < subset.size(); j++) { - if (subset[j].peaksIndices[idxJointB] == candB[i].id) { - num++; - continue; - } - } - if (num == 0) { - HumanPoseByPeaksIndices personKeypoints(keypointsNumber); - personKeypoints.peaksIndices[idxJointB] = candB[i].id; - personKeypoints.nJoints = 1; - personKeypoints.score = candB[i].score; - subset.push_back(personKeypoints); - } - } - continue; - } else if (nJointsB == 0) { - for (size_t i = 0; i < nJointsA; i++) { - int num = 0; - for (size_t j = 0; j < subset.size(); j++) { - if (subset[j].peaksIndices[idxJointA] == candA[i].id) { - num++; - continue; - } - } - if (num == 0) { - HumanPoseByPeaksIndices personKeypoints(keypointsNumber); - personKeypoints.peaksIndices[idxJointA] = candA[i].id; - personKeypoints.nJoints = 1; - personKeypoints.score = candA[i].score; - subset.push_back(personKeypoints); - } - } - continue; - } - - std::vector tempJointConnections; - for (size_t i = 0; i < nJointsA; i++) { - for (size_t j = 0; j < nJointsB; j++) { - cv::Point2f pt = candA[i].pos * 0.5 + candB[j].pos * 0.5; - cv::Point mid = 
cv::Point(cvRound(pt.x), cvRound(pt.y)); - cv::Point2f vec = candB[j].pos - candA[i].pos; - double norm_vec = cv::norm(vec); - if (norm_vec == 0) { - continue; - } - vec /= norm_vec; - float score = vec.x * scoreMid.first.at(mid) + vec.y * scoreMid.second.at(mid); - int height_n = pafs[0].rows / 2; - float suc_ratio = 0.0f; - float mid_score = 0.0f; - const int mid_num = 10; - const float scoreThreshold = -100.0f; - if (score > scoreThreshold) { - float p_sum = 0; - int p_count = 0; - cv::Size2f step((candB[j].pos.x - candA[i].pos.x) / (mid_num - 1), - (candB[j].pos.y - candA[i].pos.y) / (mid_num - 1)); - for (int n = 0; n < mid_num; n++) { - cv::Point midPoint(cvRound(candA[i].pos.x + n * step.width), - cvRound(candA[i].pos.y + n * step.height)); - cv::Point2f pred(scoreMid.first.at(midPoint), scoreMid.second.at(midPoint)); - score = vec.x * pred.x + vec.y * pred.y; - if (score > midPointsScoreThreshold) { - p_sum += score; - p_count++; - } - } - suc_ratio = static_cast(p_count / mid_num); - float ratio = p_count > 0 ? p_sum / p_count : 0.0f; - mid_score = ratio + static_cast(std::min(height_n / norm_vec - 1, 0.0)); - } - if (mid_score > 0 && suc_ratio > foundMidPointsRatioThreshold) { - tempJointConnections.push_back(TwoJointsConnection(i, j, mid_score)); - } - } - } - if (!tempJointConnections.empty()) { - std::sort(tempJointConnections.begin(), - tempJointConnections.end(), - [](const TwoJointsConnection& a, const TwoJointsConnection& b) { - return (a.score > b.score); - }); - } - size_t num_limbs = std::min(nJointsA, nJointsB); - size_t cnt = 0; - std::vector occurA(nJointsA, 0); - std::vector occurB(nJointsB, 0); - for (size_t row = 0; row < tempJointConnections.size(); row++) { - if (cnt == num_limbs) { - break; - } - const int& indexA = tempJointConnections[row].firstJointIdx; - const int& indexB = tempJointConnections[row].secondJointIdx; - const float& score = tempJointConnections[row].score; - if (occurA[indexA] == 0 && occurB[indexB] == 0) { - connections.push_back(TwoJointsConnection(candA[indexA].id, candB[indexB].id, score)); - cnt++; - occurA[indexA] = 1; - occurB[indexB] = 1; - } - } - if (connections.empty()) { - continue; - } - - bool extraJointConnections = (k == 17 || k == 18); - if (k == 0) { - subset = std::vector(connections.size(), HumanPoseByPeaksIndices(keypointsNumber)); - for (size_t i = 0; i < connections.size(); i++) { - const int& indexA = connections[i].firstJointIdx; - const int& indexB = connections[i].secondJointIdx; - subset[i].peaksIndices[idxJointA] = indexA; - subset[i].peaksIndices[idxJointB] = indexB; - subset[i].nJoints = 2; - subset[i].score = candidates[indexA].score + candidates[indexB].score + connections[i].score; - } - } else if (extraJointConnections) { - for (size_t i = 0; i < connections.size(); i++) { - const int& indexA = connections[i].firstJointIdx; - const int& indexB = connections[i].secondJointIdx; - for (size_t j = 0; j < subset.size(); j++) { - if (subset[j].peaksIndices[idxJointA] == indexA && subset[j].peaksIndices[idxJointB] == -1) { - subset[j].peaksIndices[idxJointB] = indexB; - } else if (subset[j].peaksIndices[idxJointB] == indexB && subset[j].peaksIndices[idxJointA] == -1) { - subset[j].peaksIndices[idxJointA] = indexA; - } - } - } - continue; - } else { - for (size_t i = 0; i < connections.size(); i++) { - const int& indexA = connections[i].firstJointIdx; - const int& indexB = connections[i].secondJointIdx; - bool num = false; - for (size_t j = 0; j < subset.size(); j++) { - if (subset[j].peaksIndices[idxJointA] == 
indexA) { - subset[j].peaksIndices[idxJointB] = indexB; - subset[j].nJoints++; - subset[j].score += candidates[indexB].score + connections[i].score; - num = true; - } - } - if (!num) { - HumanPoseByPeaksIndices hpWithScore(keypointsNumber); - hpWithScore.peaksIndices[idxJointA] = indexA; - hpWithScore.peaksIndices[idxJointB] = indexB; - hpWithScore.nJoints = 2; - hpWithScore.score = candidates[indexA].score + candidates[indexB].score + connections[i].score; - subset.push_back(hpWithScore); - } - } - } - } - std::vector poses; - for (const auto& subsetI : subset) { - if (subsetI.nJoints < minJointsNumber || subsetI.score / subsetI.nJoints < minSubsetScore) { - continue; - } - int position = -1; - HumanPose pose{std::vector(keypointsNumber, cv::Point2f(-1.0f, -1.0f)), - subsetI.score * std::max(0, subsetI.nJoints - 1)}; - for (const auto& peakIdx : subsetI.peaksIndices) { - position++; - if (peakIdx >= 0) { - pose.keypoints[position] = candidates[peakIdx].pos; - pose.keypoints[position].x += 0.5; - pose.keypoints[position].y += 0.5; - } - } - poses.push_back(pose); - } - return poses; -} diff --git a/model_api/cpp/models/src/style_transfer_model.cpp b/model_api/cpp/models/src/style_transfer_model.cpp deleted file mode 100644 index d245e8f7..00000000 --- a/model_api/cpp/models/src/style_transfer_model.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* -// Copyright (C) 2021-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-*/ - -#include "models/style_transfer_model.h" - -#include - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include "models/input_data.h" -#include "models/internal_model_data.h" -#include "models/results.h" - -StyleTransferModel::StyleTransferModel(const std::string& modelFile, const std::string& layout) - : ImageModel(modelFile, "standard", false, layout) {} - -void StyleTransferModel::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output --------------------------------------------- - // --------------------------- Prepare input -------------------------------------------------- - if (model->inputs().size() != 1) { - throw std::logic_error("Style transfer model wrapper supports topologies with only 1 input"); - } - - inputNames.push_back(model->input().get_any_name()); - - const ov::Shape& inputShape = model->input().get_shape(); - ov::Layout inputLayout = getInputLayout(model->input()); - - if (inputShape.size() != 4 || inputShape[ov::layout::batch_idx(inputLayout)] != 1 || - inputShape[ov::layout::channels_idx(inputLayout)] != 3) { - throw std::logic_error("3-channel 4-dimensional model's input is expected"); - } - - netInputWidth = inputShape[ov::layout::width_idx(inputLayout)]; - netInputHeight = inputShape[ov::layout::height_idx(inputLayout)]; - - ov::preprocess::PrePostProcessor ppp(model); - ppp.input().preprocess().convert_element_type(ov::element::f32); - ppp.input().tensor().set_element_type(ov::element::u8).set_layout("NHWC"); - - ppp.input().model().set_layout(inputLayout); - - // --------------------------- Prepare output ----------------------------------------------------- - const ov::OutputVector& outputs = model->outputs(); - if (outputs.size() != 1) { - throw std::logic_error("Style transfer model wrapper supports topologies with only 1 output"); - } - outputNames.push_back(model->output().get_any_name()); - - const ov::Shape& outputShape = model->output().get_shape(); - ov::Layout outputLayout{"NCHW"}; - if (outputShape.size() != 4 || outputShape[ov::layout::batch_idx(outputLayout)] != 1 || - outputShape[ov::layout::channels_idx(outputLayout)] != 3) { - throw std::logic_error("3-channel 4-dimensional model's output is expected"); - } - - ppp.output().tensor().set_element_type(ov::element::f32); - model = ppp.build(); -} - -std::unique_ptr StyleTransferModel::postprocess(InferenceResult& infResult) { - ImageResult* result = new ImageResult; - *static_cast(result) = static_cast(infResult); - - const auto& inputImgSize = infResult.internalModelData->asRef(); - const auto outputData = infResult.getFirstOutputTensor().data(); - - const ov::Shape& outputShape = infResult.getFirstOutputTensor().get_shape(); - size_t outHeight = static_cast(outputShape[2]); - size_t outWidth = static_cast(outputShape[3]); - size_t numOfPixels = outWidth * outHeight; - - std::vector imgPlanes; - imgPlanes = std::vector{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))}; - cv::Mat resultImg; - cv::merge(imgPlanes, resultImg); - cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight)); - - result->resultImage.convertTo(result->resultImage, CV_8UC3); - - return std::unique_ptr(result); -} diff --git a/model_api/cpp/models/src/super_resolution_model.cpp 
b/model_api/cpp/models/src/super_resolution_model.cpp deleted file mode 100644 index 4eeea771..00000000 --- a/model_api/cpp/models/src/super_resolution_model.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/* -// Copyright (C) 2021-2024 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -*/ - -#include "models/super_resolution_model.h" - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "models/input_data.h" -#include "models/internal_model_data.h" -#include "models/results.h" - -SuperResolutionModel::SuperResolutionModel(const std::string& modelFile, - const cv::Size& inputImgSize, - const std::string& layout) - : ImageModel(modelFile, "standard", false, layout) { - netInputHeight = inputImgSize.height; - netInputWidth = inputImgSize.width; -} - -void SuperResolutionModel::prepareInputsOutputs(std::shared_ptr& model) { - // --------------------------- Configure input & output --------------------------------------------- - // --------------------------- Prepare input -------------------------------------------------- - const ov::OutputVector& inputs = model->inputs(); - if (inputs.size() != 1 && inputs.size() != 2) { - throw std::logic_error("Super resolution model wrapper supports topologies with 1 or 2 inputs only"); - } - std::string lrInputTensorName = inputs.begin()->get_any_name(); - inputNames.push_back(lrInputTensorName); - ov::Shape lrShape = inputs.begin()->get_shape(); - if (lrShape.size() != 4) { - throw std::logic_error("Number of dimensions for an input must be 4"); - } - // in case of 2 inputs they have the same layouts - ov::Layout inputLayout = getInputLayout(model->inputs().front()); - - auto channelsId = ov::layout::channels_idx(inputLayout); - auto heightId = ov::layout::height_idx(inputLayout); - auto widthId = ov::layout::width_idx(inputLayout); - - if (lrShape[channelsId] != 1 && lrShape[channelsId] != 3) { - throw std::logic_error("Input layer is expected to have 1 or 3 channels"); - } - - // A model like single-image-super-resolution-???? 
may take bicubic interpolation of the input image as the - // second input - if (inputs.size() == 2) { - std::string bicInputTensorName; - bicInputTensorName = (++inputs.begin())->get_any_name(); - inputNames.push_back(bicInputTensorName); - ov::Shape bicShape = (++inputs.begin())->get_shape(); - if (bicShape.size() != 4) { - throw std::logic_error("Number of dimensions for both inputs must be 4"); - } - if (lrShape[widthId] >= bicShape[widthId] && lrShape[heightId] >= bicShape[heightId]) { - std::swap(bicShape, lrShape); - inputNames[0].swap(inputNames[1]); - } else if (!(lrShape[widthId] <= bicShape[widthId] && lrShape[heightId] <= bicShape[heightId])) { - throw std::logic_error("Each spatial dimension of one input must surpass or be equal to a spatial" - "dimension of another input"); - } - } - - ov::preprocess::PrePostProcessor ppp(model); - for (const auto& input : inputs) { - inputTransform.setPrecision(ppp, input.get_any_name()); - ppp.input().tensor().set_layout("NHWC"); - ppp.input().model().set_layout(inputLayout); - } - - // --------------------------- Prepare output ----------------------------------------------------- - const ov::OutputVector& outputs = model->outputs(); - if (outputs.size() != 1) { - throw std::logic_error("Super resolution model wrapper supports topologies with only 1 output"); - } - - outputNames.push_back(outputs.begin()->get_any_name()); - ppp.output().tensor().set_element_type(ov::element::f32); - model = ppp.build(); - - const ov::Shape& outShape = model->output().get_shape(); - - const ov::Layout outputLayout("NCHW"); - const auto outWidth = outShape[ov::layout::width_idx(outputLayout)]; - const auto inWidth = lrShape[ov::layout::width_idx(outputLayout)]; - changeInputSize(model, static_cast(outWidth / inWidth)); -} - -void SuperResolutionModel::changeInputSize(std::shared_ptr& model, int coeff) { - std::map shapes; - const ov::Layout& layout = ov::layout::get_layout(model->inputs().front()); - const auto batchId = ov::layout::batch_idx(layout); - const auto heightId = ov::layout::height_idx(layout); - const auto widthId = ov::layout::width_idx(layout); - - const ov::OutputVector& inputs = model->inputs(); - std::string lrInputTensorName = inputs.begin()->get_any_name(); - ov::Shape lrShape = inputs.begin()->get_shape(); - - if (inputs.size() == 2) { - std::string bicInputTensorName = (++inputs.begin())->get_any_name(); - ov::Shape bicShape = (++inputs.begin())->get_shape(); - if (lrShape[heightId] >= bicShape[heightId] && lrShape[widthId] >= bicShape[widthId]) { - std::swap(bicShape, lrShape); - std::swap(bicInputTensorName, lrInputTensorName); - } - bicShape[batchId] = 1; - bicShape[heightId] = coeff * netInputHeight; - bicShape[widthId] = coeff * netInputWidth; - shapes[bicInputTensorName] = ov::PartialShape(bicShape); - } - - lrShape[batchId] = 1; - lrShape[heightId] = netInputHeight; - lrShape[widthId] = netInputWidth; - shapes[lrInputTensorName] = ov::PartialShape(lrShape); - - model->reshape(shapes); -} - -std::shared_ptr SuperResolutionModel::preprocess(const InputData& inputData, - InferenceInput& input) { - auto imgData = inputData.asRef(); - auto img = inputTransform(imgData.inputImage); - - auto lrShape = inferenceAdapter->getInputShape(inputNames[0]).get_max_shape(); - const ov::Layout layout("NHWC"); - - if (img.channels() != static_cast(lrShape[ov::layout::channels_idx(layout)])) { - cv::cvtColor(img, img, cv::COLOR_BGR2GRAY); - } - - if (static_cast(img.cols) != netInputWidth || static_cast(img.rows) != netInputHeight) { - slog::warn << 
"\tChosen model aspect ratio doesn't match image aspect ratio" << slog::endl; - } - const size_t height = lrShape[ov::layout::height_idx(layout)]; - const size_t width = lrShape[ov::layout::width_idx(layout)]; - img = resizeImageExt(img, width, height); - input.emplace(inputNames[0], wrapMat2Tensor(img)); - - if (inputNames.size() == 2) { - auto bicShape = inferenceAdapter->getInputShape(inputNames[1]).get_max_shape(); - const int h = static_cast(bicShape[ov::layout::height_idx(layout)]); - const int w = static_cast(bicShape[ov::layout::width_idx(layout)]); - cv::Mat resized; - cv::resize(img, resized, cv::Size(w, h), 0, 0, cv::INTER_CUBIC); - input.emplace(inputNames[1], wrapMat2Tensor(resized)); - } - - return std::make_shared(img.cols, img.rows); -} - -std::unique_ptr SuperResolutionModel::postprocess(InferenceResult& infResult) { - ImageResult* result = new ImageResult; - *static_cast(result) = static_cast(infResult); - const auto outputData = infResult.getFirstOutputTensor().data(); - - std::vector imgPlanes; - const ov::Shape& outShape = infResult.getFirstOutputTensor().get_shape(); - const size_t outChannels = static_cast(outShape[1]); - const size_t outHeight = static_cast(outShape[2]); - const size_t outWidth = static_cast(outShape[3]); - const size_t numOfPixels = outWidth * outHeight; - if (outChannels == 3) { - imgPlanes = std::vector{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])), - cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2]))}; - } else { - imgPlanes = std::vector{cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))}; - // Post-processing for text-image-super-resolution models - cv::threshold(imgPlanes[0], imgPlanes[0], 0.5f, 1.0f, cv::THRESH_BINARY); - } - - for (auto& img : imgPlanes) { - img.convertTo(img, CV_8UC1, 255); - } - cv::Mat resultImg; - cv::merge(imgPlanes, resultImg); - result->resultImage = resultImg; - - return std::unique_ptr(result); -} diff --git a/model_api/python/README.md b/model_api/python/README.md index b7bdcbcf..1be4d327 100644 --- a/model_api/python/README.md +++ b/model_api/python/README.md @@ -53,24 +53,19 @@ python -c "from openvino.model_zoo import model_api" ## Model API Wrappers -The Model API package provides model wrappers, which implement standardized preprocessing/postprocessing functions per "task type" and incapsulate model-specific logic for usage of different models in a unified manner inside the application. +The Model API package provides model wrappers, which implement standardized preprocessing/postprocessing functions per "task type" and encapsulate model-specific logic for usage of different models in a unified manner inside the application. The following tasks can be solved with wrappers usage: | Task type | Model API wrappers | |----------------------------|--------------------| -| Background Matting |
`VideoBackgroundMatting`, `ImageMattingWithBackground`, `PortraitBackgroundMatting` |
 | Classification             | `ClassificationModel` |
-| Deblurring                 | `Deblurring` |
-| Human Pose Estimation      | `HpeAssociativeEmbedding`, `OpenPose` |
-| Instance Segmentation      | `MaskRCNNModel`, `YolactModel` |
-| Monocular Depth Estimation | `MonoDepthModel` |
-| Named Entity Recognition   | `BertNamedEntityRecognition` |
-| Object Detection           | `CenterNet`, `DETR`, `CTPN`, `FaceBoxes`, `NanoDet`, `NanoDetPlus`, `RetinaFace`, `RetinaFacePyTorch`, `SSD`, `UltraLightweightFaceDetection`, `YOLO`, `YoloV3ONNX`, `YoloV4`, `YOLOF`, `YOLOX` |
-| Question Answering         | `BertQuestionAnswering` |
-| Salient Object Detection   | `SalientObjectDetectionModel` |
-| Semantic Segmentation      | `SegmentationModel` |
-| Action Classification      | `ActionClassificationModel` |
+| Human Pose Estimation      | `KeypointDetectionModel` |
+| Instance Segmentation      | `MaskRCNNModel` |
+| Object Detection           | `SSD`, `YOLO`, `YoloV3ONNX`, `YoloV4`, `YOLOF`, `YOLOX` |
+| Semantic Segmentation      | `SegmentationModel` |
+| Visual Prompting           | `SAMDecoder`, `SAMImageEncoder` |
+| Action Classification      | `ActionClassificationModel` |
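For reference, a minimal usage sketch of one of the wrappers listed above (the model file name and image path are illustrative, and the exact `create_model` arguments may vary between releases):

```python
import cv2

from model_api.models import DetectionModel

# Create a detection wrapper from an OpenVINO IR file (the path is illustrative).
model = DetectionModel.create_model("ssd_mobilenet_v2.xml")

# The wrapper encapsulates preprocessing and postprocessing, so the application
# passes a plain BGR image in and gets ready-to-use detections back.
image = cv2.imread("sample.jpg")
detections = model(image)
print(detections)
```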
    | ## Model API Adapters diff --git a/model_api/python/model_api/models/__init__.py b/model_api/python/model_api/models/__init__.py index 71eec9c2..86a4a61c 100644 --- a/model_api/python/model_api/models/__init__.py +++ b/model_api/python/model_api/models/__init__.py @@ -16,32 +16,15 @@ from .action_classification import ActionClassificationModel from .anomaly import AnomalyDetection -from .background_matting import ( - ImageMattingWithBackground, - PortraitBackgroundMatting, - VideoBackgroundMatting, -) -from .bert import BertEmbedding, BertNamedEntityRecognition, BertQuestionAnswering -from .centernet import CenterNet from .classification import ClassificationModel -from .ctpn import CTPN -from .deblurring import Deblurring from .detection_model import DetectionModel -from .detr import DETR -from .faceboxes import FaceBoxes -from .hpe_associative_embedding import HpeAssociativeEmbedding from .image_model import ImageModel -from .instance_segmentation import MaskRCNNModel, YolactModel +from .instance_segmentation import MaskRCNNModel from .keypoint_detection import KeypointDetectionModel, TopDownKeypointDetectionPipeline from .model import Model -from .monodepth import MonoDepthModel -from .nanodet import NanoDet, NanoDetPlus -from .open_pose import OpenPose -from .retinaface import RetinaFace, RetinaFacePyTorch from .sam_models import SAMDecoder, SAMImageEncoder from .segmentation import SalientObjectDetectionModel, SegmentationModel from .ssd import SSD -from .ultra_lightweight_face_detection import UltraLightweightFaceDetection from .utils import ( AnomalyResult, ClassificationResult, @@ -87,19 +70,10 @@ "ActionClassificationModel", "AnomalyDetection", "AnomalyResult", - "BertEmbedding", - "BertNamedEntityRecognition", - "BertQuestionAnswering", - "CenterNet", "ClassificationModel", "Contour", - "CTPN", - "Deblurring", "DetectionModel", "DetectionWithLandmarks", - "DETR", - "FaceBoxes", - "HpeAssociativeEmbedding", "ImageMattingWithBackground", "ImageModel", "ImageResultWithSoftPrediction", @@ -113,20 +87,12 @@ "TopDownKeypointDetectionPipeline", "MaskRCNNModel", "Model", - "MonoDepthModel", - "NanoDet", - "NanoDetPlus", - "OpenPose", "OutputTransform", "PortraitBackgroundMatting", - "RetinaFace", - "RetinaFacePyTorch", "SalientObjectDetectionModel", "SegmentationModel", "SSD", - "UltraLightweightFaceDetection", "VideoBackgroundMatting", - "YolactModel", "YOLO", "YoloV3ONNX", "YoloV4", diff --git a/model_api/python/model_api/models/background_matting.py b/model_api/python/model_api/models/background_matting.py deleted file mode 100644 index c887376b..00000000 --- a/model_api/python/model_api/models/background_matting.py +++ /dev/null @@ -1,194 +0,0 @@ -""" - Copyright (c) 2022-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import cv2 -import numpy as np - -from .image_model import ImageModel - - -class VideoBackgroundMatting(ImageModel): - __model__ = "Robust-video-matting" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number((5,), (6,)) - self.output_blob_name = self._get_outputs() - self.rec_map = self.get_inputs_map() - self.rec = self.initialize_rec() - - @classmethod - def parameters(cls): - return super().parameters() - - def _get_inputs(self): - image_blob_names, image_info_blob_names = [], [] - for name, metadata in self.inputs.items(): - if len(metadata.shape) == 4 and metadata.shape[1] == 3: - image_blob_names.append(name) - if not image_blob_names: - self.raise_error("Compatible inputs are not found") - return image_blob_names, image_info_blob_names - - def _get_outputs(self): - image_blob_names = {} - for name, metadata in self.outputs.items(): - if len(metadata.shape) == 4 and metadata.shape[1] == 3: - image_blob_names["fgr"] = name - elif len(metadata.shape) == 4 and metadata.shape[1] == 1: - image_blob_names["pha"] = name - if len(image_blob_names) != 2: - self.raise_error("Compatible outputs are not found") - return image_blob_names - - def get_inputs_map(self): - rec_map = {} - for in_name, in_meta in self.inputs.items(): - if in_meta.shape[1] not in [1, 3]: - for out_name, out_meta in self.outputs.items(): - if in_meta.shape == out_meta.shape: - rec_map[in_name] = out_name - break - return rec_map - - def preprocess(self, inputs): - dict_inputs, meta = super().preprocess(inputs) - dict_inputs.update(self.rec) - return dict_inputs, meta - - def postprocess(self, outputs, meta): - fgr = outputs[self.output_blob_name["fgr"]] - pha = outputs[self.output_blob_name["pha"]] - self.rec = { - in_name: outputs[out_name] for in_name, out_name in self.rec_map.items() - } - fgr = fgr[0].transpose(1, 2, 0) - pha = pha[0].transpose(1, 2, 0) - h, w = meta["original_shape"][:2] - fgr = cv2.cvtColor(cv2.resize(fgr, (w, h)), cv2.COLOR_RGB2BGR) - pha = np.expand_dims(cv2.resize(pha, (w, h)), axis=-1) - return fgr, pha - - def initialize_rec(self): - rec = {} - for name, metadata in self.inputs.items(): - if name in self.rec_map.keys(): - rec[name] = np.zeros(metadata.shape, dtype=np.float32) - return rec - - -class ImageMattingWithBackground(ImageModel): - __model__ = "Background-matting" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number((2,), (2, 3)) - self.output_blob_name = self._get_outputs() - self.n, self.c, self.h, self.w = self.set_input_shape() - - @classmethod - def parameters(cls): - return super().parameters() - - def _get_inputs(self): - image_blob_names, image_info_blob_names = [], [] - for name, metadata in self.inputs.items(): - if len(metadata.shape) == 4 and metadata.shape[1] == 3: - image_blob_names.append(name) - if len(image_blob_names) != 2: - self.raise_error("Compatible inputs are not found") - return image_blob_names, image_info_blob_names - - def set_input_shape(self): - shapes = [tuple(self.inputs[name].shape) for name in self.image_blob_names] - if len(set(shapes)) != 1: - self.raise_error("Image inputs have incompatible shapes: {}".format(shapes)) - return shapes[0] - - def _get_outputs(self): - image_blob_names = {} - for name, metadata in self.outputs.items(): - if len(metadata.shape) == 4 and metadata.shape[1] == 3: - image_blob_names["fgr"] = name - elif 
len(metadata.shape) == 4 and metadata.shape[1] == 1: - image_blob_names["pha"] = name - if len(image_blob_names) != 2: - self.raise_error("Compatible outputs are not found") - return image_blob_names - - def preprocess(self, inputs): - dict_inputs = {} - target_shape = None - for name, image in inputs.items(): - self.image_blob_name = name - dict_input, meta = super().preprocess(image) - dict_inputs.update(dict_input) - if target_shape is None: - target_shape = meta["original_shape"] - elif meta["original_shape"] != target_shape: - self.raise_error( - "Image inputs must have equal shapes but got: {} vs {}".format( - target_shape, meta["original_shape"] - ) - ) - return dict_inputs, meta - - def postprocess(self, outputs, meta): - fgr = outputs[self.output_blob_name["fgr"]] - pha = outputs[self.output_blob_name["pha"]] - fgr = fgr[0].transpose(1, 2, 0) - pha = pha[0].transpose(1, 2, 0) - h, w = meta["original_shape"][:2] - fgr = cv2.cvtColor(cv2.resize(fgr, (w, h)), cv2.COLOR_RGB2BGR) - pha = np.expand_dims(cv2.resize(pha, (w, h)), axis=-1) - return fgr, pha - - -class PortraitBackgroundMatting(ImageModel): - __model__ = "Portrait-matting" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, 1) - self.output_blob_name = self._get_outputs() - - @classmethod - def parameters(cls): - return super().parameters() - - def _get_outputs(self): - output_blob_name = next(iter(self.outputs)) - output_size = self.outputs[output_blob_name].shape - if len(output_size) != 4: - self.raise_error( - "Unexpected output blob shape {}. Only 4D output blob is supported".format( - output_size - ) - ) - - return output_blob_name - - def preprocess(self, inputs): - dict_inputs, meta = super().preprocess(inputs) - meta.update({"original_image": inputs}) - return dict_inputs, meta - - def postprocess(self, outputs, meta): - output = outputs[self.output_blob_name][0].transpose(1, 2, 0) - original_frame = meta["original_image"] / 255.0 - h, w = meta["original_shape"][:2] - res_output = np.expand_dims(cv2.resize(output, (w, h)), -1) - return original_frame, res_output diff --git a/model_api/python/model_api/models/bert.py b/model_api/python/model_api/models/bert.py deleted file mode 100644 index da134fb9..00000000 --- a/model_api/python/model_api/models/bert.py +++ /dev/null @@ -1,250 +0,0 @@ -""" - Copyright (c) 2021-2024 Intel Corporation - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import numpy as np - -from .model import Model -from .types import BooleanValue, DictValue, NumericalValue, StringValue - - -class Bert(Model): - __model__ = "bert" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - self.token_cls = [self.vocab["[CLS]"]] - self.token_sep = [self.vocab["[SEP]"]] - self.token_pad = [self.vocab["[PAD]"]] - self.input_names = [i.strip() for i in self.input_names.split(",")] - if self.inputs.keys() != set(self.input_names): - self.raise_error( - "The Wrapper expects input names: {}, actual network input names: {}".format( - self.input_names, list(self.inputs.keys()) - ) - ) - self.max_length = self.inputs[self.input_names[0]].shape[1] - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "vocab": DictValue(), - "input_names": StringValue( - description="Comma-separated names of input layers" - ), - "enable_padding": BooleanValue( - description="Should be input sequence padded to max sequence len or not", - default_value=True, - ), - } - ) - return parameters - - def preprocess(self, inputs): - input_ids, attention_mask, token_type_ids = self.form_request(inputs) - pad_len = ( - self.pad_input(input_ids, attention_mask, token_type_ids) - if self.enable_padding - else 0 - ) - meta = {"pad_len": pad_len, "inputs": inputs} - - return self.create_input_dict(input_ids, attention_mask, token_type_ids), meta - - def form_request(self, inputs): - raise NotImplementedError - - def pad_input(self, input_ids, attention_mask, token_type_ids): - pad_len = self.max_length - len(input_ids) - if pad_len < 0: - self.raise_error( - "The input request is longer than max number of tokens ({})" - " processed by model".format(self.max_length) - ) - input_ids += self.token_pad * pad_len - token_type_ids += [0] * pad_len - attention_mask += [0] * pad_len - return pad_len - - def create_input_dict(self, input_ids, attention_mask, token_type_ids): - inputs = { - self.input_names[0]: np.array([input_ids], dtype=np.int32), - self.input_names[1]: np.array([attention_mask], dtype=np.int32), - self.input_names[2]: np.array([token_type_ids], dtype=np.int32), - } - if len(self.input_names) > 3: - inputs[self.input_names[3]] = np.arange(len(input_ids), dtype=np.int32)[ - None, : - ] - - return inputs - - def reshape(self, new_length): - new_shapes = {} - for input_name, input_info in self.inputs.items(): - new_shapes[input_name] = [1, new_length] - default_input_shape = input_info.shape - super().reshape(new_shapes) - self.logger.debug( - "\tReshape model from {} to {}".format( - default_input_shape, new_shapes[input_name] - ) - ) - self.max_length = ( - new_length if not isinstance(new_length, tuple) else new_length[1] - ) - - -class BertNamedEntityRecognition(Bert): - __model__ = "bert-named-entity-recognition" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - - self.output_names = list(self.outputs) - self._check_io_number(-1, 1) - - def form_request(self, inputs): - c_tokens_id = inputs - input_ids = self.token_cls + c_tokens_id + self.token_sep - attention_mask = [1] * len(input_ids) - token_type_ids = [0] * len(input_ids) - return input_ids, attention_mask, token_type_ids - - def postprocess(self, outputs, meta): - output = outputs[self.output_names[0]] - output = np.exp(output[0]) - score = output / output.sum(axis=-1, keepdims=True) - labels_id = 
score.argmax(-1) - - filtered_labels_id = [ - (i, label_i) - for i, label_i in enumerate(labels_id) - if label_i != 0 and 0 < i < self.max_length - meta["pad_len"] - 1 - ] - return score, filtered_labels_id - - -class BertEmbedding(Bert): - __model__ = "bert-embedding" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - - self.output_names = list(self.outputs) - self._check_io_number(-1, 1) - - def form_request(self, inputs): - tokens_id, self.max_length = inputs - input_ids = self.token_cls + tokens_id + self.token_sep - attention_mask = [1] * len(input_ids) - token_type_ids = [0] * len(input_ids) - return input_ids, attention_mask, token_type_ids - - def postprocess(self, outputs, meta): - output = outputs[self.output_names[0]] - return output.squeeze(0) - - -class BertQuestionAnswering(Bert): - __model__ = "bert-question-answering" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - - self.output_names = [o.strip() for o in self.output_names.split(",")] - if self.outputs.keys() != set(self.output_names): - self.raise_error( - "The Wrapper expects output names: {}, actual network output names: {}".format( - self.output_names, list(self.outputs.keys()) - ) - ) - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "output_names": StringValue( - description="Comma-separated names of output layers" - ), - "max_answer_token_num": NumericalValue(value_type=int), - "squad_ver": StringValue( - description="SQuAD dataset version used for training. Affects postprocessing" - ), - } - ) - return parameters - - def form_request(self, inputs): - c_data, q_tokens_id = inputs - input_ids = ( - self.token_cls - + q_tokens_id - + self.token_sep - + c_data.c_tokens_id - + self.token_sep - ) - attention_mask = [1] * len(input_ids) - token_type_ids = [0] * (len(q_tokens_id) + 2) + [1] * ( - len(c_data.c_tokens_id) + 1 - ) - return input_ids, attention_mask, token_type_ids - - def postprocess(self, outputs, meta): - def get_score(blob_name): - out = np.exp(outputs[blob_name].reshape((self.max_length,))) - return out / out.sum(axis=-1) - - pad_len, (c_data, q_tokens_id) = meta["pad_len"], meta["inputs"] - # get start-end scores for context - score_s = get_score(self.output_names[0]) - score_e = get_score(self.output_names[1]) - - # index of first context token in tensor - c_s_idx = len(q_tokens_id) + 2 - # index of last+1 context token in tensor - c_e_idx = self.max_length - (pad_len + 1) - - # find product of all start-end combinations to find the best one - max_score, max_s, max_e = self.find_best_answer_window( - score_s, score_e, c_s_idx, c_e_idx - ) - - # convert to context text start-end index - max_s = c_data.c_tokens_se[max_s][0] - max_e = c_data.c_tokens_se[max_e][1] - - return max_score, max_s, max_e - - def find_best_answer_window( - self, start_score, end_score, context_start_idx, context_end_idx - ): - # get 'no-answer' score (not valid if model has been fine-tuned on squad1.x) - score_na = 0 if "1." 
in self.squad_ver else start_score[0] * end_score[0] - - context_len = context_end_idx - context_start_idx - score_mat = np.matmul( - start_score[context_start_idx:context_end_idx].reshape((context_len, 1)), - end_score[context_start_idx:context_end_idx].reshape((1, context_len)), - ) - # reset candidates with end before start - score_mat = np.triu(score_mat) - # reset long candidates (>max_answer_token_num) - score_mat = np.tril(score_mat, self.max_answer_token_num - 1) - # find the best start-end pair - max_s, max_e = divmod(score_mat.flatten().argmax(), score_mat.shape[1]) - max_score = score_mat[max_s, max_e] * (1 - score_na) - - return max_score, max_s, max_e diff --git a/model_api/python/model_api/models/centernet.py b/model_api/python/model_api/models/centernet.py deleted file mode 100644 index 5e9b1da2..00000000 --- a/model_api/python/model_api/models/centernet.py +++ /dev/null @@ -1,205 +0,0 @@ -""" - Copyright (c) 2019-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import cv2 -import numpy as np -from numpy.lib.stride_tricks import as_strided - -from .detection_model import DetectionModel -from .utils import Detection, clip_detections - - -class CenterNet(DetectionModel): - __model__ = "centernet" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, 3) - self._output_layer_names = sorted(self.outputs) - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters["resize_type"].update_default_value("standard") - return parameters - - def postprocess(self, outputs, meta): - heat = outputs[self._output_layer_names[0]][0] - reg = outputs[self._output_layer_names[1]][0] - wh = outputs[self._output_layer_names[2]][0] - heat = np.exp(heat) / (1 + np.exp(heat)) - height, width = heat.shape[1:3] - num_predictions = 100 - - heat = self._nms(heat) - scores, inds, clses, ys, xs = self._topk(heat, K=num_predictions) - reg = self._tranpose_and_gather_feat(reg, inds) - - reg = reg.reshape((num_predictions, 2)) - xs = xs.reshape((num_predictions, 1)) + reg[:, 0:1] - ys = ys.reshape((num_predictions, 1)) + reg[:, 1:2] - - wh = self._tranpose_and_gather_feat(wh, inds) - wh = wh.reshape((num_predictions, 2)) - clses = clses.reshape((num_predictions, 1)) - scores = scores.reshape((num_predictions, 1)) - bboxes = np.concatenate( - ( - xs - wh[..., 0:1] / 2, - ys - wh[..., 1:2] / 2, - xs + wh[..., 0:1] / 2, - ys + wh[..., 1:2] / 2, - ), - axis=1, - ) - detections = np.concatenate((bboxes, scores, clses), axis=1) - mask = detections[..., 4] >= self.confidence_threshold - filtered_detections = detections[mask] - scale = max(meta["original_shape"]) - center = np.array(meta["original_shape"][:2]) / 2.0 - dets = self._transform( - filtered_detections, np.flip(center, 0), scale, height, width - ) - dets = [Detection(x[0], x[1], x[2], x[3], score=x[4], id=x[5]) for x in dets] - return clip_detections(dets, meta["original_shape"]) - - @staticmethod - 
def get_affine_transform(center, scale, rot, output_size, inv=False): - def get_dir(src_point, rot_rad): - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - src_result = [0, 0] - src_result[0] = src_point[0] * cs - src_point[1] * sn - src_result[1] = src_point[0] * sn + src_point[1] * cs - return src_result - - def get_3rd_point(a, b): - direct = a - b - return b + np.array([-direct[1], direct[0]], dtype=np.float32) - - if not isinstance(scale, np.ndarray) and not isinstance(scale, list): - scale = np.array([scale, scale], dtype=np.float32) - - scale_tmp = scale - src_w = scale_tmp[0] - dst_w, dst_h = output_size - - rot_rad = np.pi * rot / 180 - src_dir = get_dir([0, src_w * -0.5], rot_rad) - dst_dir = np.array([0, dst_w * -0.5], dtype=np.float32) - - dst = np.zeros((3, 2), dtype=np.float32) - src = np.zeros((3, 2), dtype=np.float32) - src[0, :], src[1, :] = center, center + src_dir - dst[0, :] = [dst_w * 0.5, dst_h * 0.5] - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5], np.float32) + dst_dir - src[2:, :] = get_3rd_point(src[0, :], src[1, :]) - dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) - - if inv: - trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) - else: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - - return trans - - @staticmethod - def _gather_feat(feat, ind): - dim = feat.shape[1] - ind = np.expand_dims(ind, axis=1) - ind = np.repeat(ind, dim, axis=1) - feat = feat[ind, np.arange(feat.shape[1])] - return feat - - @staticmethod - def _tranpose_and_gather_feat(feat, ind): - feat = np.transpose(feat, (1, 2, 0)) - feat = feat.reshape((-1, feat.shape[2])) - feat = CenterNet._gather_feat(feat, ind) - return feat - - @staticmethod - def _topk(scores, K=40): - cat, _, width = scores.shape - - scores = scores.reshape((cat, -1)) - topk_inds = np.argpartition(scores, -K, axis=1)[:, -K:] - topk_scores = scores[np.arange(scores.shape[0])[:, None], topk_inds] - - topk_ys = (topk_inds / width).astype(np.int32).astype(float) - topk_xs = (topk_inds % width).astype(np.int32).astype(float) - - topk_scores = topk_scores.reshape((-1)) - topk_ind = np.argpartition(topk_scores, -K)[-K:] - topk_score = topk_scores[topk_ind] - topk_clses = topk_ind / K - topk_inds = CenterNet._gather_feat( - topk_inds.reshape((-1, 1)), topk_ind - ).reshape((K)) - topk_ys = CenterNet._gather_feat(topk_ys.reshape((-1, 1)), topk_ind).reshape( - (K) - ) - topk_xs = CenterNet._gather_feat(topk_xs.reshape((-1, 1)), topk_ind).reshape( - (K) - ) - - return topk_score, topk_inds, topk_clses, topk_ys, topk_xs - - @staticmethod - def _nms(heat, kernel=3): - def max_pool2d(A, kernel_size, padding=1, stride=1): - A = np.pad(A, padding, mode="constant") - output_shape = ( - (A.shape[0] - kernel_size) // stride + 1, - (A.shape[1] - kernel_size) // stride + 1, - ) - kernel_size = (kernel_size, kernel_size) - A_w = as_strided( - A, - shape=output_shape + kernel_size, - strides=(stride * A.strides[0], stride * A.strides[1]) + A.strides, - ) - A_w = A_w.reshape(-1, *kernel_size) - - return A_w.max(axis=(1, 2)).reshape(output_shape) - - pad = (kernel - 1) // 2 - - hmax = np.array([max_pool2d(channel, kernel, pad) for channel in heat]) - keep = hmax == heat - return heat * keep - - @staticmethod - def _transform_preds(coords, center, scale, output_size): - def affine_transform(pt, t): - new_pt = np.array([pt[0], pt[1], 1.0], dtype=np.float32).T - new_pt = np.dot(t, new_pt) - return new_pt[:2] - - target_coords = np.zeros(coords.shape) - trans = CenterNet.get_affine_transform(center, scale, 0, output_size, 
inv=True) - for p in range(coords.shape[0]): - target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) - return target_coords - - @staticmethod - def _transform(dets, center, scale, height, width): - dets[:, :2] = CenterNet._transform_preds( - dets[:, 0:2], center, scale, (width, height) - ) - dets[:, 2:4] = CenterNet._transform_preds( - dets[:, 2:4], center, scale, (width, height) - ) - return dets diff --git a/model_api/python/model_api/models/ctpn.py b/model_api/python/model_api/models/ctpn.py deleted file mode 100644 index cc5e60ae..00000000 --- a/model_api/python/model_api/models/ctpn.py +++ /dev/null @@ -1,483 +0,0 @@ -""" - Copyright (c) 2021-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import cv2 -import numpy as np - -from .detection_model import DetectionModel -from .types import ListValue, NumericalValue -from .utils import Detection, clip_detections, nms - - -class CTPN(DetectionModel): - __model__ = "CTPN" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, False) - self._check_io_number(1, 2) - self.bboxes_blob_name, self.scores_blob_name = self._get_outputs() - - self.min_size = 8 - self.min_ratio = 0.5 - self.min_width = 32 - self.pre_nms_top_n = 1000 - self.post_nms_top_n = 500 - self.text_proposal_connector = TextProposalConnector() - - self.anchors = np.array( - [ - [0, 2, 15, 13], - [0, 0, 15, 15], - [0, -4, 15, 19], - [0, -9, 15, 24], - [0, -16, 15, 31], - [0, -26, 15, 41], - [0, -41, 15, 56], - [0, -62, 15, 77], - [0, -91, 15, 106], - [0, -134, 15, 149], - ] - ) - - self.h1, self.w1 = self.ctpn_keep_aspect_ratio( - 1200, 600, self.input_size[1], self.input_size[0] - ) - self.h2, self.w2 = self.ctpn_keep_aspect_ratio(600, 600, self.w1, self.h1) - default_input_shape = self.inputs[self.image_blob_name].shape - new_shape = ( - [self.n, self.c, self.h2, self.w2] - if self.nchw_layout - else [self.n, self.h2, self.w2, self.c] - ) - input_shape = {self.image_blob_name: (new_shape)} - self.logger.debug( - "\tReshape model from {} to {}".format( - default_input_shape, input_shape[self.image_blob_name] - ) - ) - self.reshape(input_shape) - if preload: - self.load() - - def _get_outputs(self): - (boxes_name, boxes_data_repr), ( - scores_name, - scores_data_repr, - ) = self.outputs.items() - - if len(boxes_data_repr.shape) != 4 or len(scores_data_repr.shape) != 4: - self.raise_error( - "Unexpected output blob shape. 
Only 4D output blobs are supported" - ) - - if self.nchw_layout: - scores_channels = scores_data_repr.shape[1] - boxes_channels = boxes_data_repr.shape[1] - else: - scores_channels = scores_data_repr.shape[3] - boxes_channels = boxes_data_repr.shape[3] - - if scores_channels == boxes_channels * 2: - return scores_name, boxes_name - if boxes_channels == scores_channels * 2: - return boxes_name, scores_name - self.raise_error("One of outputs must be two times larger than another") - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "iou_threshold": NumericalValue( - default_value=0.5, - description="Threshold for non-maximum suppression (NMS) intersection over union (IOU) filtering", - ), - "input_size": ListValue( - description="Image resolution which is going to be processed. Reshapes network to match a given size" - ), - } - ) - parameters["confidence_threshold"].update_default_value(0.9) - parameters["labels"].update_default_value(["Text"]) - return parameters - - def preprocess(self, inputs): - meta = {"original_shape": inputs.shape} - scales = (self.w1 / inputs.shape[1], self.h1 / inputs.shape[0]) - - if scales[0] < 1 and scales[1] < 1: - meta["scales"] = [scales] - inputs = cv2.resize(inputs, (self.w1, self.h1)) - if ( - self.h2 == 600 - and self.w2 == 600 - or (self.h1 != self.h2 or self.w1 != self.w2) - ): - meta.setdefault("scales", []).append( - (self.w2 / inputs.shape[1], self.h2 / inputs.shape[0]) - ) - inputs = cv2.resize(inputs, (self.w2, self.h2)) - - inputs = self._change_layout(inputs) - dict_inputs = {self.image_blob_name: inputs} - return dict_inputs, meta - - def postprocess(self, outputs, meta): - first_scales = meta["scales"].pop() - boxes = ( - outputs[self.bboxes_blob_name][0].transpose((1, 2, 0)) - if self.nchw_layout - else outputs[self.bboxes_blob_name][0] - ) - scores = ( - outputs[self.scores_blob_name][0].transpose((1, 2, 0)) - if self.nchw_layout - else outputs[self.scores_blob_name][0] - ) - - textsegs, scores = self.get_proposals(scores, boxes, meta["original_shape"]) - textsegs[:, 0::2] /= first_scales[0] - textsegs[:, 1::2] /= first_scales[1] - boxes = self.get_detections( - textsegs, scores[:, np.newaxis], meta["original_shape"] - ) - if meta["scales"]: - second_scales = meta["scales"].pop() - boxes[:, 0:8:2] /= second_scales[0] - boxes[:, 1:8:2] /= second_scales[1] - detections = [ - Detection(box[0], box[1], box[2], box[5], box[8], 0) for box in boxes - ] - return clip_detections(detections, meta["original_shape"]) - - @staticmethod - def ctpn_keep_aspect_ratio(dst_width, dst_height, image_width, image_height): - scale = min(dst_height, dst_width) - max_scale = max(dst_height, dst_width) - im_min_size = min(image_width, image_height) - im_max_size = max(image_width, image_height) - im_scale = float(scale) / float(im_min_size) - if np.round(im_scale * im_max_size) > max_scale: - im_scale = float(max_scale) / float(im_max_size) - new_h = np.round(image_height * im_scale) - new_w = np.round(image_width * im_scale) - - return int(new_h), int(new_w) - - def get_proposals( - self, rpn_cls_prob_reshape, bbox_deltas, image_size, _feat_stride=16 - ): - """ - Parameters - rpn_cls_prob_reshape: (H , W , Ax2), probabilities for predicted regions - bbox_deltas: (H , W , Ax4), predicted regions - image_size: a list of [image_height, image_width] - _feat_stride: the downsampling ratio of feature map to the original input image - Algorithm: - for each (H, W) location i - generate A anchor boxes centered on location i - 
apply predicted bbox deltas at location i to each of the A anchors - clip predicted boxes to image - remove predicted boxes with either height or width < threshold - sort all (proposal, score) pairs by score from highest to lowest - take top pre_nms_topN proposals before NMS - apply NMS with threshold to remaining proposals - take after_nms_top_n proposals after NMS - return the top proposals (-> RoIs top, scores top) - """ - - _anchors = self.anchors.copy() - _num_anchors = _anchors.shape[0] - height, width = rpn_cls_prob_reshape.shape[:2] - scores = np.reshape( - np.reshape(rpn_cls_prob_reshape, [height, width, _num_anchors, 2])[ - :, :, :, 1 - ], - [height, width, _num_anchors], - ) - shift_x = np.arange(0, width) * _feat_stride - shift_y = np.arange(0, height) * _feat_stride - shift_x, shift_y = np.meshgrid(shift_x, shift_y) - shifts = np.vstack( - (shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel()) - ).transpose() - _num_shifts = shifts.shape[0] - anchors = _anchors.reshape((1, _num_anchors, 4)) + shifts.reshape( - (1, _num_shifts, 4) - ).transpose((1, 0, 2)) - anchors = anchors.reshape((_num_shifts * _num_anchors, 4)) - # Transpose and reshape predicted bbox transformations to get them - # into the same order as the anchors: - # bbox deltas will be (4 * A, H, W) format - # transpose to (H, W, 4 * A) - # reshape to (H * W * A, 4) where rows are ordered by (h, w, a) - # in slowest to fastest order - bbox_deltas = bbox_deltas.reshape((-1, 4)) # (HxWxA, 4) - - # Same story for the scores: - scores = scores.reshape((-1, 1)) - - # Convert anchors into proposals via bbox transformations - proposals = self.bbox_transform_inv(anchors, bbox_deltas) - - # clip predicted boxes to image - proposals[:, :4].clip( - min=0, - max=( - image_size[1] - 1, - image_size[0] - 1, - image_size[1] - 1, - image_size[0] - 1, - ), - out=proposals[:, :4], - ) - # sort all (proposal, score) pairs by score from highest to lowest - order = scores.ravel().argsort()[::-1] - if self.pre_nms_top_n > 0: - order = order[: self.pre_nms_top_n] - proposals, scores = proposals[order, :], scores[order] - - # apply nms - keep = nms( - proposals[:, 0], - proposals[:, 1], - proposals[:, 2], - proposals[:, 3], - scores.reshape(-1), - self.iou_threshold, - include_boundaries=True, - ) - if self.post_nms_top_n > 0: - keep = keep[: self.post_nms_top_n] - proposals, scores = proposals[keep, :], scores[keep] - return proposals, scores - - def get_detections(self, text_proposals, scores, size): - keep_inds = np.where(scores > 0.7)[0] - text_proposals, scores = text_proposals[keep_inds], scores[keep_inds] - - sorted_indices = np.argsort(scores.ravel())[::-1] - text_proposals, scores = text_proposals[sorted_indices], scores[sorted_indices] - - text_recs = self.text_proposal_connector.get_text_lines( - text_proposals, scores, size - ) - - heights = ( - abs(text_recs[:, 5] - text_recs[:, 1]) - + abs(text_recs[:, 7] - text_recs[:, 3]) - ) / 2.0 + 1 - widths = ( - abs(text_recs[:, 2] - text_recs[:, 0]) - + abs(text_recs[:, 6] - text_recs[:, 4]) - ) / 2.0 + 1 - scores = text_recs[:, 8] - keep_inds = np.where( - (widths / heights > self.min_ratio) - & (scores > self.confidence_threshold) - & (widths > self.min_width) - )[0] - - return text_recs[keep_inds] - - @staticmethod - def bbox_transform_inv(boxes, deltas): - boxes = boxes.astype(deltas.dtype, copy=False) - - widths = boxes[:, 2] - boxes[:, 0] + 1.0 - heights = boxes[:, 3] - boxes[:, 1] + 1.0 - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights - - 
dy = deltas[:, 1::4] - dh = deltas[:, 3::4] - - pred_ctr_x = ctr_x[:, np.newaxis] - pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] - pred_w = widths[:, np.newaxis] - pred_h = np.exp(dh) * heights[:, np.newaxis] - - pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) - pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w - pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h - pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - - return pred_boxes - - -class Graph: - def __init__(self, graph): - self.graph = graph - - def sub_graphs_connected(self): - sub_graphs = [] - for index in range(self.graph.shape[0]): - if not self.graph[:, index].any() and self.graph[index, :].any(): - v = index - sub_graphs.append([v]) - while self.graph[v, :].any(): - v = np.where(self.graph[v, :])[0][0] - sub_graphs[-1].append(v) - - return sub_graphs - - -class TextProposalGraphBuilder: - """ - Build Text proposals into a graph. - """ - - def get_successions(self, index): - box = self.text_proposals[index] - results = [] - for left in range(int(box[0]) + 1, min(int(box[0]) + 50 + 1, self.im_size[1])): - adj_box_indices = self.boxes_table[left] - for adj_box_index in adj_box_indices: - if self.meet_v_iou(adj_box_index, index): - results.append(adj_box_index) - if results: - return results - return results - - def get_precursors(self, index): - box = self.text_proposals[index] - results = [] - for left in range(int(box[0]) - 1, max(int(box[0] - 50), 0) - 1, -1): - adj_box_indices = self.boxes_table[left] - for adj_box_index in adj_box_indices: - if self.meet_v_iou(adj_box_index, index): - results.append(adj_box_index) - if results: - return results - return results - - def is_succession_node(self, index, succession_index): - precursors = self.get_precursors(succession_index) - return self.scores[index] >= np.max(self.scores[precursors]) - - def meet_v_iou(self, index1, index2): - def overlaps_v(h1, h2, text_proposal1, text_proposal2): - y0 = max(text_proposal2[1], text_proposal1[1]) - y1 = min(text_proposal2[3], text_proposal1[3]) - return max(0, y1 - y0 + 1) / min(h1, h2) - - def size_similarity(h1, h2): - return min(h1, h2) / max(h1, h2) - - height_1 = self.heights[index1] - height_2 = self.heights[index2] - proposal_1 = self.text_proposals[index1] - proposal_2 = self.text_proposals[index2] - size_similarity_estimation = size_similarity(height_1, height_2) - vertical_overlap = overlaps_v(height_1, height_2, proposal_1, proposal_2) - - return vertical_overlap >= 0.7 and size_similarity_estimation >= 0.7 - - def build_graph(self, text_proposals, scores, im_size): - self.text_proposals = text_proposals - self.scores = scores - self.im_size = im_size - self.heights = text_proposals[:, 3] - text_proposals[:, 1] + 1 - - boxes_table = [[] for _ in range(self.im_size[1])] - for index, box in enumerate(text_proposals): - boxes_table[int(box[0])].append(index) - self.boxes_table = boxes_table - - graph = np.zeros((text_proposals.shape[0], text_proposals.shape[0]), bool) - - for index, box in enumerate(text_proposals): - successions = self.get_successions(index) - if not successions: - continue - succession_index = successions[np.argmax(scores[successions])] - if self.is_succession_node(index, succession_index): - graph[index, succession_index] = True - - return Graph(graph) - - -class TextProposalConnector: - def __init__(self): - self.graph_builder = TextProposalGraphBuilder() - - def group_text_proposals(self, text_proposals, scores, image_size): - graph = 
self.graph_builder.build_graph(text_proposals, scores, image_size) - return graph.sub_graphs_connected() - - def get_text_lines(self, text_proposals, scores, image_size): - def fit_y(x, y, x1, x2): - if np.sum(x == x[0]) == np.size(x): - return y[0], y[0] - p = np.poly1d(np.polyfit(x, y, 1)) - return p(x1), p(x2) - - tp_groups = self.group_text_proposals(text_proposals, scores, image_size) - - text_lines = np.zeros((len(tp_groups), 5), np.float32) - - for index, tp_indices in enumerate(tp_groups): - text_line_boxes = text_proposals[list(tp_indices)] - - x0 = np.min(text_line_boxes[:, 0]) - x1 = np.max(text_line_boxes[:, 2]) - - offset = (text_line_boxes[0, 2] - text_line_boxes[0, 0]) * 0.5 - - lt_y, rt_y = fit_y( - text_line_boxes[:, 0], text_line_boxes[:, 1], x0 + offset, x1 - offset - ) - lb_y, rb_y = fit_y( - text_line_boxes[:, 0], text_line_boxes[:, 3], x0 + offset, x1 - offset - ) - score = scores[list(tp_indices)].sum() / float(len(tp_indices)) - - text_lines[index, 0] = x0 - text_lines[index, 1] = min(lt_y, rt_y) - text_lines[index, 2] = x1 - text_lines[index, 3] = max(lb_y, rb_y) - text_lines[index, 4] = score - - text_lines[:, :4].clip( - min=0, - max=( - image_size[1] - 1, - image_size[0] - 1, - image_size[1] - 1, - image_size[0] - 1, - ), - out=text_lines[:, :4], - ) - - text_recs = np.zeros((len(text_lines), 9), float) - for index, line in enumerate(text_lines): - xmin, ymin, xmax, ymax = line[0], line[1], line[2], line[3] - ( - text_recs[index, 0], - text_recs[index, 1], - text_recs[index, 2], - text_recs[index, 3], - ) = (xmin, ymin, xmax, ymin) - ( - text_recs[index, 4], - text_recs[index, 5], - text_recs[index, 6], - text_recs[index, 7], - ) = (xmax, ymax, xmin, ymax) - text_recs[index, 8] = line[4] - - return text_recs diff --git a/model_api/python/model_api/models/deblurring.py b/model_api/python/model_api/models/deblurring.py deleted file mode 100644 index 2b0b0840..00000000 --- a/model_api/python/model_api/models/deblurring.py +++ /dev/null @@ -1,101 +0,0 @@ -""" - Copyright (c) 2021-2024 Intel Corporation - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import math - -import cv2 -import numpy as np - -from .image_model import ImageModel - - -class Deblurring(ImageModel): - __model__ = "Deblurring" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, 1) - self.block_size = 32 - self.output_blob_name = self._get_outputs() - - @classmethod - def parameters(cls): - parameters = super().parameters() - return parameters - - def reshape(self, base_shape): - h, w, _ = base_shape - new_height = math.ceil(h / self.block_size) * self.block_size - new_width = math.ceil(w / self.block_size) * self.block_size - self.h, self.w = new_height, new_width - self.logger.debug( - "\tReshape model from {} to {}".format( - [self.n, self.c, h, w], [self.n, self.c, self.h, self.w] - ) - ) - super().reshape({self.image_blob_name: [self.n, self.c, self.h, self.w]}) - - def _get_outputs(self): - output_blob_name = next(iter(self.outputs)) - output_size = self.outputs[output_blob_name].shape - if len(output_size) != 4: - self.raise_error( - "Unexpected output blob shape {}. Only 4D output blob is supported".format( - output_size - ) - ) - - return output_blob_name - - def preprocess(self, inputs): - image = inputs - - if ( - self.h - self.block_size < image.shape[0] <= self.h - and self.w - self.block_size < image.shape[1] <= self.w - ): - pad_params = { - "mode": "constant", - "constant_values": 0, - "pad_width": ( - (0, self.h - image.shape[0]), - (0, self.w - image.shape[1]), - (0, 0), - ), - } - resized_image = np.pad(image, **pad_params) - else: - self.logger.warning( - "\tChosen model size doesn't match image size. The image is resized" - ) - resized_image = cv2.resize(image, (self.w, self.h)) - - resized_image = resized_image.transpose((2, 0, 1)) - resized_image = np.expand_dims(resized_image, 0) - dict_inputs = {self.image_blob_name: resized_image} - meta = {"original_shape": image.shape[1::-1]} - return dict_inputs, meta - - def postprocess(self, outputs, meta): - dsize = meta["original_shape"] - prediction = outputs[self.output_blob_name].squeeze() - prediction = prediction.transpose((1, 2, 0)) - if ( - self.h - self.block_size < dsize[1] <= self.h - and self.w - self.block_size < dsize[0] <= self.w - ): - prediction = prediction[: dsize[1], : dsize[0], :] - else: - prediction = cv2.resize(prediction, dsize) - prediction *= 255 - return prediction.astype(np.uint8) diff --git a/model_api/python/model_api/models/detr.py b/model_api/python/model_api/models/detr.py deleted file mode 100644 index 371ef7d1..00000000 --- a/model_api/python/model_api/models/detr.py +++ /dev/null @@ -1,96 +0,0 @@ -""" - Copyright (c) 2021-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import numpy as np - -from .detection_model import DetectionModel -from .utils import Detection, softmax - - -class DETR(DetectionModel): - __model__ = "DETR" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, 2) - self.bboxes_blob_name, self.scores_blob_name = self._get_outputs() - - def _get_outputs(self): - (bboxes_blob_name, bboxes_layer), ( - scores_blob_name, - scores_layer, - ) = self.outputs.items() - - if bboxes_layer.shape[1] != scores_layer.shape[1]: - self.raise_error( - "Expected the same second dimension for boxes and scores, but got {} and {}".format( - bboxes_layer.shape, scores_layer.shape - ) - ) - - if bboxes_layer.shape[2] == 4: - return bboxes_blob_name, scores_blob_name - elif scores_layer.shape[2] == 4: - return scores_blob_name, bboxes_blob_name - else: - self.raise_error( - "Expected shape [:,:,4] for bboxes output, but got {} and {}".format( - bboxes_layer.shape, scores_layer.shape - ) - ) - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters["resize_type"].update_default_value("standard") - parameters["confidence_threshold"].update_default_value(0.5) - return parameters - - def postprocess(self, outputs, meta): - detections = self._parse_outputs(outputs) - detections = self._resize_detections(detections, meta) - return detections - - def _parse_outputs(self, outputs): - boxes = outputs[self.bboxes_blob_name][0] - scores = outputs[self.scores_blob_name][0] - - x_mins, y_mins, x_maxs, y_maxs = self.box_cxcywh_to_xyxy(boxes) - - scores = np.array([softmax(logit) for logit in scores]) - labels = np.argmax(scores[:, :-1], axis=-1) - det_scores = np.max(scores[:, :-1], axis=-1) - - keep = det_scores > self.confidence_threshold - - detections = [ - Detection(*det) - for det in zip( - x_mins[keep], - y_mins[keep], - x_maxs[keep], - y_maxs[keep], - det_scores[keep], - labels[keep], - ) - ] - return detections - - @staticmethod - def box_cxcywh_to_xyxy(box): - x_c, y_c, w, h = box.T - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] - return b diff --git a/model_api/python/model_api/models/faceboxes.py b/model_api/python/model_api/models/faceboxes.py deleted file mode 100644 index c1e1cc95..00000000 --- a/model_api/python/model_api/models/faceboxes.py +++ /dev/null @@ -1,178 +0,0 @@ -""" - Copyright (c) 2020-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import itertools -import math - -import numpy as np - -from .detection_model import DetectionModel -from .types import NumericalValue -from .utils import Detection, nms - - -class FaceBoxes(DetectionModel): - __model__ = "FaceBoxes" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self.bboxes_blob_name, self.scores_blob_name = self._get_outputs() - self.min_sizes = [[32, 64, 128], [256], [512]] - self.steps = [32, 64, 128] - self.variance = [0.1, 0.2] - self.keep_top_k = 750 - - def _get_outputs(self): - (bboxes_blob_name, bboxes_layer), ( - scores_blob_name, - scores_layer, - ) = self.outputs.items() - - if bboxes_layer.shape[1] != scores_layer.shape[1]: - self.raise_error( - "Expected the same second dimension for boxes and scores, but got {} and {}".format( - bboxes_layer.shape, scores_layer.shape - ) - ) - - if bboxes_layer.shape[2] == 4: - return bboxes_blob_name, scores_blob_name - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "iou_threshold": NumericalValue( - default_value=0.3, - description="Threshold for non-maximum suppression (NMS) intersection over union (IOU) filtering", - ) - } - ) - parameters["labels"].update_default_value(["Face"]) - return parameters - - def postprocess(self, outputs, meta): - detections = self._parse_outputs(outputs, meta) - detections = self._resize_detections(detections, meta) - return detections - - def _parse_outputs(self, outputs, meta): - boxes = outputs[self.bboxes_blob_name][0] - scores = outputs[self.scores_blob_name][0] - - detections = [] - - feature_maps = [ - [math.ceil(self.h / step), math.ceil(self.w / step)] for step in self.steps - ] - prior_data = self.prior_boxes(feature_maps, [self.h, self.w]) - - boxes[:, :2] = self.variance[0] * boxes[:, :2] - boxes[:, 2:] = self.variance[1] * boxes[:, 2:] - boxes[:, :2] = boxes[:, :2] * prior_data[:, 2:] + prior_data[:, :2] - boxes[:, 2:] = np.exp(boxes[:, 2:]) * prior_data[:, 2:] - - score = np.transpose(scores)[1] - - mask = score > self.confidence_threshold - filtered_boxes, filtered_score = boxes[mask, :], score[mask] - if filtered_score.size != 0: - x_mins = filtered_boxes[:, 0] - 0.5 * filtered_boxes[:, 2] - y_mins = filtered_boxes[:, 1] - 0.5 * filtered_boxes[:, 3] - x_maxs = filtered_boxes[:, 0] + 0.5 * filtered_boxes[:, 2] - y_maxs = filtered_boxes[:, 1] + 0.5 * filtered_boxes[:, 3] - - keep = nms( - x_mins, - y_mins, - x_maxs, - y_maxs, - filtered_score, - self.iou_threshold, - keep_top_k=self.keep_top_k, - ) - - filtered_score = filtered_score[keep] - x_mins = x_mins[keep] - y_mins = y_mins[keep] - x_maxs = x_maxs[keep] - y_maxs = y_maxs[keep] - - if filtered_score.size > self.keep_top_k: - filtered_score = filtered_score[: self.keep_top_k] - x_mins = x_mins[: self.keep_top_k] - y_mins = y_mins[: self.keep_top_k] - x_maxs = x_maxs[: self.keep_top_k] - y_maxs = y_maxs[: self.keep_top_k] - - detections = [ - Detection(*det, 0) - for det in zip(x_mins, y_mins, x_maxs, y_maxs, filtered_score) - ] - return detections - - @staticmethod - def calculate_anchors(list_x, list_y, min_size, image_size, step): - anchors = [] - s_kx = min_size / image_size[1] - s_ky = min_size / image_size[0] - dense_cx = [x * step / image_size[1] for x in list_x] - dense_cy = [y * step / image_size[0] for y in list_y] - for cy, cx in itertools.product(dense_cy, dense_cx): - anchors.append([cx, cy, s_kx, s_ky]) - return anchors - - def 
calculate_anchors_zero_level(self, f_x, f_y, min_sizes, image_size, step): - anchors = [] - for min_size in min_sizes: - if min_size == 32: - list_x = [f_x + 0, f_x + 0.25, f_x + 0.5, f_x + 0.75] - list_y = [f_y + 0, f_y + 0.25, f_y + 0.5, f_y + 0.75] - elif min_size == 64: - list_x = [f_x + 0, f_x + 0.5] - list_y = [f_y + 0, f_y + 0.5] - else: - list_x = [f_x + 0.5] - list_y = [f_y + 0.5] - anchors.extend( - self.calculate_anchors(list_x, list_y, min_size, image_size, step) - ) - return anchors - - def prior_boxes(self, feature_maps, image_size): - anchors = [] - for k, f in enumerate(feature_maps): - for i, j in itertools.product(range(f[0]), range(f[1])): - if k == 0: - anchors.extend( - self.calculate_anchors_zero_level( - j, i, self.min_sizes[k], image_size, self.steps[k] - ) - ) - else: - anchors.extend( - self.calculate_anchors( - [j + 0.5], - [i + 0.5], - self.min_sizes[k][0], - image_size, - self.steps[k], - ) - ) - anchors = np.clip(anchors, 0, 1) - - return anchors diff --git a/model_api/python/model_api/models/hpe_associative_embedding.py b/model_api/python/model_api/models/hpe_associative_embedding.py deleted file mode 100644 index 1944104e..00000000 --- a/model_api/python/model_api/models/hpe_associative_embedding.py +++ /dev/null @@ -1,458 +0,0 @@ -""" - Copyright (C) 2020-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import numpy as np -from model_api.adapters.utils import resize_image_ocv -from scipy.optimize import linear_sum_assignment - -from .image_model import ImageModel -from .types import NumericalValue, StringValue - - -class HpeAssociativeEmbedding(ImageModel): - __model__ = "HPE-assosiative-embedding" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload=False) - self.heatmaps_blob_name = find_layer_by_name("heatmaps", self.outputs) - try: - self.nms_heatmaps_blob_name = find_layer_by_name( - "nms_heatmaps", self.outputs - ) - except ValueError: - self.nms_heatmaps_blob_name = self.heatmaps_blob_name - self.embeddings_blob_name = find_layer_by_name("embeddings", self.outputs) - self.output_scale = self.w / self.outputs[self.heatmaps_blob_name].shape[-1] - - if self.target_size is None: - self.target_size = min(self.h, self.w) - self.index_of_max_dimension = 0 - if self.aspect_ratio >= 1.0: # img width >= height - input_height, input_width = self.target_size, round( - self.target_size * self.aspect_ratio - ) - self.index_of_max_dimension = 1 - else: - input_height, input_width = ( - round(self.target_size / self.aspect_ratio), - self.target_size, - ) - self.h = ( - (input_height + self.size_divisor - 1) - // self.size_divisor - * self.size_divisor - ) - self.w = ( - (input_width + self.size_divisor - 1) - // self.size_divisor - * self.size_divisor - ) - default_input_shape = self.inputs[self.image_blob_name].shape - input_shape = {self.image_blob_name: [self.n, self.c, self.h, self.w]} - self.logger.debug( - "\tReshape model from {} to {}".format( - default_input_shape, input_shape[self.image_blob_name] - ) - ) - super().reshape(input_shape) - - if preload: - self.load() - - self.decoder = AssociativeEmbeddingDecoder( - num_joints=self.outputs[self.heatmaps_blob_name].shape[1], - adjust=True, - refine=True, - delta=self.delta, - max_num_people=30, - detection_threshold=0.1, - tag_threshold=1, - pose_threshold=self.confidence_threshold, - use_detection_val=True, - ignore_too_much=False, - dist_reweight=True, - ) - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "target_size": NumericalValue( - value_type=int, - min=1, - description="Image resolution which is going to be processed. Reshapes network to match a given size", - ), - "aspect_ratio": NumericalValue( - description="Image aspect ratio which is going to be processed. 
Reshapes network to match a given size" - ), - "confidence_threshold": NumericalValue( - description="Pose confidence threshold" - ), - "delta": NumericalValue(default_value=0.0), - "size_divisor": NumericalValue( - default_value=32, - value_type=int, - description="Width and height of the rehaped model will be a multiple of this value", - ), - "padding_mode": StringValue( - default_value="right_bottom", choices=("center", "right_bottom") - ), - } - ) - return parameters - - def preprocess(self, inputs): - img = resize_image_ocv(inputs, (self.w, self.h), keep_aspect_ratio=True) - h, w = img.shape[:2] - if not ( - self.h - self.size_divisor < h <= self.h - and self.w - self.size_divisor < w <= self.w - ): - self.logger.warning( - "\tChosen model aspect ratio doesn't match image aspect ratio" - ) - resize_img_scale = np.array( - (inputs.shape[1] / w, inputs.shape[0] / h), np.float32 - ) - - if self.padding_mode == "center": - pad = ( - (self.h - h + 1) // 2, - (self.h - h) // 2, - (self.w - w + 1) // 2, - (self.w - w) // 2, - ) - else: - pad = (0, self.h - h, 0, self.w - w) - img = np.pad( - img, (pad[:2], pad[2:], (0, 0)), mode="constant", constant_values=0 - ) - img = img.transpose((2, 0, 1)) # Change data layout from HWC to CHW - img = img[None] - meta = {"original_size": inputs.shape[:2], "resize_img_scale": resize_img_scale} - return {self.image_blob_name: img}, meta - - def postprocess(self, outputs, meta): - heatmaps = outputs[self.heatmaps_blob_name] - nms_heatmaps = outputs[self.nms_heatmaps_blob_name] - aembds = outputs[self.embeddings_blob_name] - poses, scores = self.decoder(heatmaps, aembds, nms_heatmaps=nms_heatmaps) - # Rescale poses to the original image. - if self.padding_mode == "center": - scale = meta["resize_img_scale"][self.index_of_max_dimension] - poses[:, :, :2] *= scale * self.output_scale - shift = ( - meta["original_size"][self.index_of_max_dimension] - - max(self.h, self.w) * scale - ) / 2 - poses[:, :, 1 - self.index_of_max_dimension] += shift - else: - poses[:, :, :2] *= meta["resize_img_scale"] * self.output_scale - return poses, scores - - -def find_layer_by_name(name, layers): - suitable_layers = [] - for layer, metadata in layers.items(): - count_names = len( - [layer_name for layer_name in metadata.names if layer_name.startswith(name)] - ) - if count_names > 0: - suitable_layers.append(layer) - if not suitable_layers: - raise ValueError('Suitable layer for "{}" output is not found'.format(name)) - - if len(suitable_layers) > 1: - raise ValueError('More than 1 layer matched to "{}" output'.format(name)) - - return suitable_layers[0] - - -class Pose: - def __init__(self, num_joints, tag_size=1): - self.num_joints = num_joints - self.tag_size = tag_size - # 2 is for x, y and 1 is for joint confidence - self.pose = np.zeros((num_joints, 2 + 1 + tag_size), dtype=np.float32) - self.pose_tag = np.zeros(tag_size, dtype=np.float32) - self.valid_points_num = 0 - self.c = np.zeros(2, dtype=np.float32) - - def add(self, idx, joint, tag): - self.pose[idx] = joint - self.c = self.c * self.valid_points_num + joint[:2] - self.pose_tag = (self.pose_tag * self.valid_points_num) + tag - self.valid_points_num += 1 - self.c /= self.valid_points_num - self.pose_tag /= self.valid_points_num - - @property - def tag(self): - if self.valid_points_num > 0: - return self.pose_tag - return None - - @property - def center(self): - if self.valid_points_num > 0: - return self.c - return None - - -class AssociativeEmbeddingDecoder: - def __init__( - self, - num_joints, - max_num_people, - 
detection_threshold, - use_detection_val, - ignore_too_much, - tag_threshold, - pose_threshold, - adjust=True, - refine=True, - delta=0.0, - joints_order=None, - dist_reweight=True, - ): - self.num_joints = num_joints - self.max_num_people = max_num_people - self.detection_threshold = detection_threshold - self.tag_threshold = tag_threshold - self.pose_threshold = pose_threshold - self.use_detection_val = use_detection_val - self.ignore_too_much = ignore_too_much - - if self.num_joints == 17 and joints_order is None: - self.joint_order = ( - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 11, - 12, - 7, - 8, - 9, - 10, - 13, - 14, - 15, - 16, - ) - else: - self.joint_order = list(np.arange(self.num_joints)) - - self.do_adjust = adjust - self.do_refine = refine - self.dist_reweight = dist_reweight - self.delta = delta - - @staticmethod - def _max_match(scores): - r, c = linear_sum_assignment(scores) - return np.stack((r, c), axis=1) - - def _match_by_tag(self, inp): - tag_k, loc_k, val_k = inp - embd_size = tag_k.shape[2] - all_joints = np.concatenate((loc_k, val_k[..., None], tag_k), -1) - - poses = [] - for idx in self.joint_order: - tags = tag_k[idx] - joints = all_joints[idx] - mask = joints[:, 2] > self.detection_threshold - tags = tags[mask] - joints = joints[mask] - - if len(poses) == 0: - for tag, joint in zip(tags, joints): - pose = Pose(self.num_joints, embd_size) - pose.add(idx, joint, tag) - poses.append(pose) - continue - - if joints.shape[0] == 0 or ( - self.ignore_too_much and len(poses) == self.max_num_people - ): - continue - - poses_tags = np.stack([p.tag for p in poses], axis=0) - diff = tags[:, None] - poses_tags[None, :] - diff_normed = np.linalg.norm(diff, ord=2, axis=2) - diff_saved = np.copy(diff_normed) - - if self.dist_reweight: - # Reweight cost matrix to prefer nearby points among all that are close enough in a tag space. 
- centers = np.stack([p.center for p in poses], axis=0)[None] - dists = np.linalg.norm( - joints[:, :2][:, None, :] - centers, ord=2, axis=2 - ) - close_tags_masks = diff_normed < self.tag_threshold - min_dists = np.min(dists, axis=0, keepdims=True) - dists /= min_dists + 1e-10 - diff_normed[close_tags_masks] *= dists[close_tags_masks] - - if self.use_detection_val: - diff_normed = np.round(diff_normed) * 100 - joints[:, 2:3] - num_added = diff.shape[0] - num_grouped = diff.shape[1] - if num_added > num_grouped: - diff_normed = np.pad( - diff_normed, - ((0, 0), (0, num_added - num_grouped)), - mode="constant", - constant_values=1e10, - ) - - pairs = self._max_match(diff_normed) - for row, col in pairs: - if ( - row < num_added - and col < num_grouped - and diff_saved[row][col] < self.tag_threshold - ): - poses[col].add(idx, joints[row], tags[row]) - else: - pose = Pose(self.num_joints, embd_size) - pose.add(idx, joints[row], tags[row]) - poses.append(pose) - - ans = np.asarray([p.pose for p in poses], dtype=np.float32).reshape( - -1, self.num_joints, 2 + 1 + embd_size - ) - tags = np.asarray([p.tag for p in poses], dtype=np.float32).reshape( - -1, embd_size - ) - return ans, tags - - def top_k(self, heatmaps, tags): - N, K, H, W = heatmaps.shape - heatmaps = heatmaps.reshape(N, K, -1) - ind = heatmaps.argpartition(-self.max_num_people, axis=2)[ - :, :, -self.max_num_people : - ] - val_k = np.take_along_axis(heatmaps, ind, axis=2) - subind = np.argsort(-val_k, axis=2) - ind = np.take_along_axis(ind, subind, axis=2) - val_k = np.take_along_axis(val_k, subind, axis=2) - - tags = tags.reshape(N, K, W * H, -1) - tag_k = [ - np.take_along_axis(tags[..., i], ind, axis=2) for i in range(tags.shape[3]) - ] - tag_k = np.stack(tag_k, axis=3) - - x = ind % W - y = ind // W - loc_k = np.stack((x, y), axis=3) - return tag_k, loc_k, val_k - - @staticmethod - def adjust(ans, heatmaps): - H, W = heatmaps.shape[-2:] - for batch_idx, people in enumerate(ans): - for person in people: - for k, joint in enumerate(person): - heatmap = heatmaps[batch_idx, k] - px = int(joint[0]) - py = int(joint[1]) - if 1 < px < W - 1 and 1 < py < H - 1: - diff = np.array( - [ - heatmap[py, px + 1] - heatmap[py, px - 1], - heatmap[py + 1, px] - heatmap[py - 1, px], - ] - ) - joint[:2] += np.sign(diff) * 0.25 - return ans - - @staticmethod - def refine(heatmap, tag, keypoints, pose_tag=None): - K, H, W = heatmap.shape - if len(tag.shape) == 3: - tag = tag[..., None] - - if pose_tag is not None: - prev_tag = pose_tag - else: - tags = [] - for i in range(K): - if keypoints[i, 2] > 0: - x, y = keypoints[i][:2].astype(int) - tags.append(tag[i, y, x]) - prev_tag = np.mean(tags, axis=0) - - for i, (_heatmap, _tag) in enumerate(zip(heatmap, tag)): - if keypoints[i, 2] > 0: - continue - # Get position with the closest tag value to the pose tag. - diff = np.abs(_tag[..., 0] - prev_tag) + 0.5 - diff = diff.astype(np.int32).astype(_heatmap.dtype) - diff -= _heatmap - idx = diff.argmin() - y, x = np.divmod(idx, _heatmap.shape[-1]) - # Corresponding keypoint detection score. 
- val = _heatmap[y, x] - if val > 0: - keypoints[i, :3] = x, y, val - if 1 < x < W - 1 and 1 < y < H - 1: - diff = np.array( - [ - _heatmap[y, x + 1] - _heatmap[y, x - 1], - _heatmap[y + 1, x] - _heatmap[y - 1, x], - ] - ) - keypoints[i, :2] += np.sign(diff) * 0.25 - - return keypoints - - def __call__(self, heatmaps, tags, nms_heatmaps): - tag_k, loc_k, val_k = self.top_k(nms_heatmaps, tags) - ans = tuple( - map(self._match_by_tag, zip(tag_k, loc_k, val_k)) - ) # Call _match_by_tag() for each element in batch - ans, ans_tags = map(list, zip(*ans)) - - np.abs(heatmaps, out=heatmaps) - - if self.do_adjust: - ans = self.adjust(ans, heatmaps) - - if self.delta != 0.0: - for people in ans: - for person in people: - for joint in person: - joint[:2] += self.delta - - ans = ans[0] - scores = np.asarray([i[:, 2].mean() for i in ans]) - mask = scores > self.pose_threshold - ans = ans[mask] - scores = scores[mask] - - if self.do_refine: - heatmap_numpy = heatmaps[0] - tag_numpy = tags[0] - for i, pose in enumerate(ans): - ans[i] = self.refine(heatmap_numpy, tag_numpy, pose, ans_tags[0][i]) - - return ans, scores diff --git a/model_api/python/model_api/models/instance_segmentation.py b/model_api/python/model_api/models/instance_segmentation.py index 7242eb22..821b961f 100644 --- a/model_api/python/model_api/models/instance_segmentation.py +++ b/model_api/python/model_api/models/instance_segmentation.py @@ -282,193 +282,6 @@ def _segm_postprocess(box, raw_cls_mask, im_h, im_w): return im_mask -class YolactModel(ImageModel): - __model__ = "Yolact" - - def __init__(self, inference_adapter, configuration, preload=False): - super().__init__(inference_adapter, configuration, preload) - if self.path_to_labels: - self.labels = load_labels(self.path_to_labels) - self._check_io_number(1, 4) - self.output_blob_name = self._get_outputs() - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "confidence_threshold": NumericalValue( - default_value=0.5, - description="Probability threshold for detections filtering", - ), - "labels": ListValue(description="List of class labels"), - "path_to_labels": StringValue( - description="Path to file with labels. 
Overrides the labels" - ), - } - ) - return parameters - - def _get_outputs(self): - outputs = {} - for layer_name in self.outputs: - layer_shape = self.outputs[layer_name].shape - if layer_name == "boxes" and len(layer_shape) == 3: - outputs["boxes"] = layer_name - elif layer_name == "conf" and len(layer_shape) == 3: - outputs["conf"] = layer_name - elif layer_name == "proto" and len(layer_shape) == 4: - outputs["proto"] = layer_name - elif layer_name == "mask" and len(layer_shape) == 3: - outputs["masks"] = layer_name - else: - self.raise_error( - "Unexpected output layer shape {} with name {}".format( - layer_shape, layer_name - ) - ) - return outputs - - def postprocess(self, outputs, meta): - frame_height, frame_width = meta["original_shape"][:-1] - input_height, input_width = meta["resized_shape"][:-1] - scale_x = meta["resized_shape"][1] / meta["original_shape"][1] - scale_y = meta["resized_shape"][0] / meta["original_shape"][0] - - boxes = outputs["boxes"][0] - conf = np.transpose(outputs["conf"][0]) - masks = outputs["mask"][0] - proto = outputs["proto"][0] - num_classes = conf.shape[0] - idx_lst, cls_lst, scr_lst = [], [], [] - shift_x = (input_width - (frame_width * scale_x)) / frame_width - shift_y = (input_height - (frame_height * scale_y)) / frame_height - - for cls in range(1, num_classes): - cls_scores = conf[cls, :] - idx = np.arange(cls_scores.shape[0]) - conf_mask = cls_scores > self.confidence_threshold - - cls_scores = cls_scores[conf_mask] - idx = idx[conf_mask] - - if cls_scores.shape[0] == 0: - continue - x1, x2 = self._sanitize_coordinates( - boxes[idx, 0], boxes[idx, 2], frame_width - ) - y1, y2 = self._sanitize_coordinates( - boxes[idx, 1], boxes[idx, 3], frame_height - ) - keep = nms(x1, y1, x2, y2, cls_scores, 0.5) - - idx_lst.append(idx[keep]) - cls_lst.append(np.full(len(keep), cls)) - scr_lst.append(cls_scores[keep]) - - if not idx_lst: - return np.array([]), np.array([]), np.array([]), np.array([]) - idx = np.concatenate(idx_lst, axis=0) - classes = np.concatenate(cls_lst, axis=0) - scores = np.concatenate(scr_lst, axis=0) - - idx2 = np.argsort(scores, axis=0)[::-1] - scores = scores[idx2] - - idx = idx[idx2] - classes = classes[idx2] - - boxes = boxes[idx] - masks = masks[idx] - if np.size(boxes) > 0: - boxes, scores, classes, masks = self._segm_postprocess( - boxes, - masks, - scores, - classes, - proto, - frame_width, - frame_height, - shift_x=shift_x, - shift_y=shift_y, - ) - return scores, classes, boxes, masks - - def _segm_postprocess( - self, boxes, masks, score, classes, proto_data, w, h, shift_x=0, shift_y=0 - ): - if self.confidence_threshold > 0: - keep = score > self.confidence_threshold - score = score[keep] - boxes = boxes[keep] - masks = masks[keep] - classes = classes[keep] - if np.size(score) == 0: - return [] * 4 - - masks = proto_data @ masks.T - masks = 1 / (1 + np.exp(-masks)) - masks = self._crop_mask(masks, boxes) - - masks = np.transpose(masks, (2, 0, 1)) - boxes[:, 0], boxes[:, 2] = self._sanitize_coordinates( - boxes[:, 0], boxes[:, 2], w, shift_x - ) - boxes[:, 1], boxes[:, 3] = self._sanitize_coordinates( - boxes[:, 1], boxes[:, 3], h, shift_y - ) - ready_masks = [] - - for mask in masks: - mask = cv2.resize(mask, (w, h), cv2.INTER_LINEAR) - mask = mask > 0.5 - ready_masks.append(mask.astype(np.uint8)) - - return boxes, score, classes, ready_masks - - def _crop_mask(self, masks, boxes, padding: int = 1): - h, w, n = np.shape(masks) - x1, x2 = self._sanitize_coordinates( - boxes[:, 0], boxes[:, 2], w, padding=padding - ) - y1, y2 
= self._sanitize_coordinates( - boxes[:, 1], boxes[:, 3], h, padding=padding - ) - - rows = np.reshape( - np.repeat( - np.reshape(np.repeat(np.arange(w, dtype=x1.dtype), h), (w, h)), - n, - axis=-1, - ), - (h, w, n), - ) - cols = np.reshape( - np.repeat( - np.reshape(np.repeat(np.arange(h, dtype=x1.dtype), h), (w, h)), - n, - axis=-1, - ), - (h, w, n), - ) - rows = np.transpose(rows, (1, 0, 2)) - - masks_left = rows >= x1 - masks_right = rows < x2 - masks_up = cols >= y1 - masks_down = cols < y2 - crop_mask = masks_left * masks_right * masks_up * masks_down - return masks * crop_mask - - @staticmethod - def _sanitize_coordinates(_x1, _x2, img_size, shift=0, padding=0): - _x1 = (_x1 + shift / 2) * img_size - _x2 = (_x2 + shift / 2) * img_size - x1 = np.clip(_x1 - padding, 0, img_size) - x2 = np.clip(_x2 + padding, 0, img_size) - return x1, x2 - - _saliency_map_name = "saliency_map" _feature_vector_name = "feature_vector" diff --git a/model_api/python/model_api/models/monodepth.py b/model_api/python/model_api/models/monodepth.py deleted file mode 100644 index a8c94152..00000000 --- a/model_api/python/model_api/models/monodepth.py +++ /dev/null @@ -1,43 +0,0 @@ -""" - Copyright (C) 2018-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import cv2 - -from .segmentation import SegmentationModel - - -class MonoDepthModel(SegmentationModel): - __model__ = "MonoDepth" - - def postprocess(self, outputs, meta): - result = outputs[self.output_blob_name].squeeze() - input_image_height = meta["original_shape"][0] - input_image_width = meta["original_shape"][1] - - result = cv2.resize( - result, - (input_image_width, input_image_height), - interpolation=cv2.INTER_CUBIC, - ) - - disp_min = result.min() - disp_max = result.max() - if disp_max - disp_min > 1e-6: - result = (result - disp_min) / (disp_max - disp_min) - else: - result.fill(0.5) - - return result diff --git a/model_api/python/model_api/models/nanodet.py b/model_api/python/model_api/models/nanodet.py deleted file mode 100644 index 31486ab4..00000000 --- a/model_api/python/model_api/models/nanodet.py +++ /dev/null @@ -1,177 +0,0 @@ -""" - Copyright (c) 2022-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import math - -import numpy as np - -from .detection_model import DetectionModel -from .types import NumericalValue -from .utils import Detection, clip_detections, nms, softmax - - -class NanoDet(DetectionModel): - __model__ = "NanoDet" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, 1) - self.output_blob_name = self._get_outputs() - self.reg_max = 7 - self.strides = [8, 16, 32] - self.ad = 0.5 - - def _get_outputs(self): - output_blob_name = next(iter(self.outputs)) - output_size = self.outputs[output_blob_name].shape - if len(output_size) != 3: - self.raise_error( - "Unexpected output blob shape {}. Only 3D output blob is supported".format( - output_size - ) - ) - - return output_blob_name - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters["resize_type"].update_default_value("fit_to_window") - parameters["confidence_threshold"].update_default_value(0.5) - parameters.update( - { - "iou_threshold": NumericalValue( - default_value=0.6, - description="Threshold for non-maximum suppression (NMS) intersection over union (IOU) filtering", - ), - "num_classes": NumericalValue( - default_value=80, value_type=int, description="Number of classes" - ), - } - ) - return parameters - - def postprocess(self, outputs, meta): - detections = self._parse_outputs(outputs, meta) - detections = self.rescale_detections(detections, meta) - return detections - - def _parse_outputs(self, outputs, meta): - output = outputs[self.output_blob_name][0] - - cls_scores = output[:, : self.num_classes] - bbox_preds = output[:, self.num_classes :] - input_height, input_width = ( - meta["padded_shape"][:2] - if meta.get("padded_shape") - else meta["resized_shape"][:2] - ) - - bboxes = self.get_bboxes(bbox_preds, input_height, input_width) - dets = [] - for label, score in enumerate(np.transpose(cls_scores)): - mask = score > self.confidence_threshold - filtered_boxes, score = bboxes[mask, :], score[mask] - if score.size == 0: - continue - x_mins, y_mins, x_maxs, y_maxs = filtered_boxes.T - keep = nms( - x_mins, - y_mins, - x_maxs, - y_maxs, - score, - self.iou_threshold, - include_boundaries=True, - ) - score = score[keep] - x_mins, y_mins, x_maxs, y_maxs = ( - x_mins[keep], - y_mins[keep], - x_maxs[keep], - y_maxs[keep], - ) - labels = np.full_like(score, label, dtype=int) - dets += [ - Detection(*det) - for det in zip(x_mins, y_mins, x_maxs, y_maxs, score, labels) - ] - return dets - - @staticmethod - def distance2bbox(points, distance, max_shape): - x1 = np.expand_dims(points[:, 0] - distance[:, 0], -1).clip(0, max_shape[1]) - y1 = np.expand_dims(points[:, 1] - distance[:, 1], -1).clip(0, max_shape[0]) - x2 = np.expand_dims(points[:, 0] + distance[:, 2], -1).clip(0, max_shape[1]) - y2 = np.expand_dims(points[:, 1] + distance[:, 3], -1).clip(0, max_shape[0]) - return np.concatenate((x1, y1, x2, y2), axis=-1) - - def get_single_level_center_point(self, featmap_size, stride): - h, w = featmap_size - x_range, y_range = (np.arange(w) + self.ad) * stride, ( - np.arange(h) + self.ad - ) * stride - y, x = np.meshgrid(y_range, x_range, indexing="ij") - return y.flatten(), x.flatten() - - def get_bboxes(self, reg_preds, input_height, input_width): - featmap_sizes = [ - (math.ceil(input_height / stride), math.ceil(input_width) / stride) - for stride in self.strides - ] - list_center_priors = [] - for stride, featmap_size in zip(self.strides, featmap_sizes): - y, x = 
self.get_single_level_center_point(featmap_size, stride) - strides = np.full_like(x, stride) - list_center_priors.append(np.stack([x, y, strides, strides], axis=-1)) - center_priors = np.concatenate(list_center_priors, axis=0) - dist_project = np.linspace(0, self.reg_max, self.reg_max + 1) - x = np.dot( - softmax( - np.reshape(reg_preds, (*reg_preds.shape[:-1], 4, self.reg_max + 1)), - -1, - True, - ), - dist_project, - ) - dis_preds = x * np.expand_dims(center_priors[:, 2], -1) - return self.distance2bbox( - center_priors[:, :2], dis_preds, (input_height, input_width) - ) - - @staticmethod - def rescale_detections(detections, meta): - input_h, input_w, _ = meta["resized_shape"] - orig_h, orig_w, _ = meta["original_shape"] - w = orig_w / input_w - h = orig_h / input_h - - for detection in detections: - detection.xmin *= w - detection.xmax *= w - detection.ymin *= h - detection.ymax *= h - - return clip_detections(detections, meta["original_shape"]) - - -class NanoDetPlus(NanoDet): - __model__ = "NanoDet-Plus" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self.ad = 0 - self.strides = [8, 16, 32, 64] diff --git a/model_api/python/model_api/models/open_pose.py b/model_api/python/model_api/models/open_pose.py deleted file mode 100644 index b812e8a9..00000000 --- a/model_api/python/model_api/models/open_pose.py +++ /dev/null @@ -1,531 +0,0 @@ -""" - Copyright (C) 2020-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import cv2 -import numpy as np - -try: - from numpy.core.umath import clip -except ImportError: - from numpy import clip - -import openvino.runtime.opset8 as opset8 - -from .image_model import ImageModel -from .types import NumericalValue - - -class OpenPose(ImageModel): - __model__ = "OpenPose" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload=False) - self.pooled_heatmaps_blob_name = "pooled_heatmaps" - self.heatmaps_blob_name = "heatmaps" - self.pafs_blob_name = "pafs" - - function = self.inference_adapter.model - paf = function.get_output_op(0) - paf_shape = paf.output(0).get_shape() - heatmap = function.get_output_op(1) - - heatmap_shape = heatmap.output(0).get_shape() - if len(paf_shape) != 4 and len(heatmap_shape) != 4: - self.raise_error("OpenPose outputs must be 4-dimensional") - if paf_shape[2] != heatmap_shape[2] and paf_shape[3] != heatmap_shape[3]: - self.raise_error("Last two dimensions of OpenPose outputs must match") - if paf_shape[1] * 2 == heatmap_shape[1]: - paf, heatmap = heatmap, paf - elif paf_shape[1] != heatmap_shape[1] * 2: - self.raise_error( - "Size of second dimension of OpenPose of one output must be two times larger then size " - "of second dimension of another output" - ) - - paf = paf.inputs()[0].get_source_output().get_node() - paf.get_output_tensor(0).set_names({self.pafs_blob_name}) - heatmap = heatmap.inputs()[0].get_source_output().get_node() - - heatmap.get_output_tensor(0).set_names({self.heatmaps_blob_name}) - - # Add keypoints NMS to the network. - # Heuristic NMS kernel size adjustment depending on the feature maps upsampling ratio. - p = int(np.round(6 / 7 * self.upsample_ratio)) - k = 2 * p + 1 - pooled_heatmap = opset8.max_pool( - heatmap, - kernel_shape=(k, k), - dilations=(1, 1), - pads_begin=(p, p), - pads_end=(p, p), - strides=(1, 1), - name=self.pooled_heatmaps_blob_name, - ) - pooled_heatmap.output(0).get_tensor().set_names( - {self.pooled_heatmaps_blob_name} - ) - self.inference_adapter.model.add_outputs([pooled_heatmap.output(0)]) - - self.inputs = self.inference_adapter.get_input_layers() - self.outputs = self.inference_adapter.get_output_layers() - - self.output_scale = ( - self.inputs[self.image_blob_name].shape[-2] - / self.outputs[self.heatmaps_blob_name].shape[-2] - ) - - if self.target_size is None: - self.target_size = self.inputs[self.image_blob_name].shape[-2] - self.h = ( - (self.target_size + self.size_divisor - 1) - // self.size_divisor - * self.size_divisor - ) - input_width = round(self.target_size * self.aspect_ratio) - self.w = ( - (input_width + self.size_divisor - 1) - // self.size_divisor - * self.size_divisor - ) - default_input_shape = self.inputs[self.image_blob_name].shape - input_shape = { - self.image_blob_name: (default_input_shape[:-2] + [self.h, self.w]) - } - self.logger.debug( - "\tReshape model from {} to {}".format( - default_input_shape, input_shape[self.image_blob_name] - ) - ) - super().reshape(input_shape) - - if preload: - self.load() - - num_joints = ( - self.outputs[self.heatmaps_blob_name].shape[1] - 1 - ) # The last channel is for background - self.decoder = OpenPoseDecoder( - num_joints, score_threshold=self.confidence_threshold - ) - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "target_size": NumericalValue( - value_type=int, - min=1, - description="Image resolution which is going to be processed. 
Reshapes network to match a given size", - ), - "aspect_ratio": NumericalValue( - description="Image aspect ratio which is going to be processed. Reshapes network to match a given size" - ), - "confidence_threshold": NumericalValue( - description="pose confidence threshold" - ), - "upsample_ratio": NumericalValue( - default_value=1, - value_type=int, - description="Upsample ratio of a model backbone", - ), - "size_divisor": NumericalValue( - default_value=8, - value_type=int, - description="Width and height of the rehaped model will be a multiple of this value", - ), - } - ) - return parameters - - @staticmethod - def heatmap_nms(heatmaps, pooled_heatmaps): - return heatmaps * (heatmaps == pooled_heatmaps) - - @staticmethod - def _resize_image(frame, input_h): - h = frame.shape[0] - scale = input_h / h - return cv2.resize(frame, None, fx=scale, fy=scale) - - def preprocess(self, inputs): - img = self._resize_image(inputs, self.h) - h, w = img.shape[:2] - if self.w < w: - self.raise_error("The image aspect ratio doesn't fit current model shape") - if not (self.w - self.size_divisor < w <= self.w): - self.logger.warning( - "\tChosen model aspect ratio doesn't match image aspect ratio" - ) - resize_img_scale = np.array( - (inputs.shape[1] / w, inputs.shape[0] / h), np.float32 - ) - - img = np.pad( - img, ((0, 0), (0, self.w - w), (0, 0)), mode="constant", constant_values=0 - ) - img = img.transpose((2, 0, 1)) # Change data layout from HWC to CHW - img = img[None] - meta = {"resize_img_scale": resize_img_scale} - return {self.image_blob_name: img}, meta - - def postprocess(self, outputs, meta): - heatmaps = outputs[self.heatmaps_blob_name] - pafs = outputs[self.pafs_blob_name] - pooled_heatmaps = outputs[self.pooled_heatmaps_blob_name] - nms_heatmaps = self.heatmap_nms(heatmaps, pooled_heatmaps) - poses, scores = self.decoder(heatmaps, nms_heatmaps, pafs) - # Rescale poses to the original image. 
- poses[:, :, :2] *= meta["resize_img_scale"] * self.output_scale - return poses, scores - - -class OpenPoseDecoder: - BODY_PARTS_KPT_IDS = ( - (1, 2), - (1, 5), - (2, 3), - (3, 4), - (5, 6), - (6, 7), - (1, 8), - (8, 9), - (9, 10), - (1, 11), - (11, 12), - (12, 13), - (1, 0), - (0, 14), - (14, 16), - (0, 15), - (15, 17), - (2, 16), - (5, 17), - ) - BODY_PARTS_PAF_IDS = ( - 12, - 20, - 14, - 16, - 22, - 24, - 0, - 2, - 4, - 6, - 8, - 10, - 28, - 30, - 34, - 32, - 36, - 18, - 26, - ) - - def __init__( - self, - num_joints=18, - skeleton=BODY_PARTS_KPT_IDS, - paf_indices=BODY_PARTS_PAF_IDS, - max_points=100, - score_threshold=0.1, - min_paf_alignment_score=0.05, - delta=0.5, - ): - self.num_joints = num_joints - self.skeleton = skeleton - self.paf_indices = paf_indices - self.max_points = max_points - self.score_threshold = score_threshold - self.min_paf_alignment_score = min_paf_alignment_score - self.delta = delta - - self.points_per_limb = 10 - self.grid = np.arange(self.points_per_limb, dtype=np.float32).reshape(1, -1, 1) - - def __call__(self, heatmaps, nms_heatmaps, pafs): - batch_size, _, h, w = heatmaps.shape - assert batch_size == 1, "Batch size of 1 only supported" - - keypoints = self.extract_points(heatmaps, nms_heatmaps) - pafs = np.transpose(pafs, (0, 2, 3, 1)) - - if self.delta > 0: - for kpts in keypoints: - kpts[:, :2] += self.delta - clip(kpts[:, 0], 0, w - 1, out=kpts[:, 0]) - clip(kpts[:, 1], 0, h - 1, out=kpts[:, 1]) - - pose_entries, keypoints = self.group_keypoints( - keypoints, pafs, pose_entry_size=self.num_joints + 2 - ) - poses, scores = self.convert_to_coco_format(pose_entries, keypoints) - if len(poses) > 0: - poses = np.asarray(poses, dtype=np.float32) - poses = poses.reshape((poses.shape[0], -1, 3)) - else: - poses = np.empty((0, 17, 3), dtype=np.float32) - scores = np.empty(0, dtype=np.float32) - - return poses, scores - - def extract_points(self, heatmaps, nms_heatmaps): - batch_size, channels_num, h, w = heatmaps.shape - assert batch_size == 1, "Batch size of 1 only supported" - assert channels_num >= self.num_joints - - xs, ys, scores = self.top_k(nms_heatmaps) - masks = scores > self.score_threshold - all_keypoints = [] - keypoint_id = 0 - for k in range(self.num_joints): - # Filter low-score points. - mask = masks[0, k] - x = xs[0, k][mask].ravel() - y = ys[0, k][mask].ravel() - score = scores[0, k][mask].ravel() - n = len(x) - if n == 0: - all_keypoints.append(np.empty((0, 4), dtype=np.float32)) - continue - # Apply quarter offset to improve localization accuracy. - x, y = self.refine(heatmaps[0, k], x, y) - clip(x, 0, w - 1, out=x) - clip(y, 0, h - 1, out=y) - # Pack resulting points. - keypoints = np.empty((n, 4), dtype=np.float32) - keypoints[:, 0] = x - keypoints[:, 1] = y - keypoints[:, 2] = score - keypoints[:, 3] = np.arange(keypoint_id, keypoint_id + n) - keypoint_id += n - all_keypoints.append(keypoints) - return all_keypoints - - def top_k(self, heatmaps): - N, K, _, W = heatmaps.shape - heatmaps = heatmaps.reshape(N, K, -1) - # Get positions with top scores. - ind = heatmaps.argpartition(-self.max_points, axis=2)[:, :, -self.max_points :] - scores = np.take_along_axis(heatmaps, ind, axis=2) - # Keep top scores sorted. 
- subind = np.argsort(-scores, axis=2) - ind = np.take_along_axis(ind, subind, axis=2) - scores = np.take_along_axis(scores, subind, axis=2) - y, x = np.divmod(ind, W) - return x, y, scores - - @staticmethod - def refine(heatmap, x, y): - h, w = heatmap.shape[-2:] - valid = np.logical_and( - np.logical_and(x > 0, x < w - 1), np.logical_and(y > 0, y < h - 1) - ) - xx = x[valid] - yy = y[valid] - dx = np.sign(heatmap[yy, xx + 1] - heatmap[yy, xx - 1], dtype=np.float32) * 0.25 - dy = np.sign(heatmap[yy + 1, xx] - heatmap[yy - 1, xx], dtype=np.float32) * 0.25 - x = x.astype(np.float32) - y = y.astype(np.float32) - x[valid] += dx - y[valid] += dy - return x, y - - @staticmethod - def is_disjoint(pose_a, pose_b): - pose_a = pose_a[:-2] - pose_b = pose_b[:-2] - return np.all(np.logical_or.reduce((pose_a == pose_b, pose_a < 0, pose_b < 0))) - - def update_poses( - self, - kpt_a_id, - kpt_b_id, - all_keypoints, - connections, - pose_entries, - pose_entry_size, - ): - for connection in connections: - pose_a_idx = -1 - pose_b_idx = -1 - for j, pose in enumerate(pose_entries): - if pose[kpt_a_id] == connection[0]: - pose_a_idx = j - if pose[kpt_b_id] == connection[1]: - pose_b_idx = j - if pose_a_idx < 0 and pose_b_idx < 0: - # Create new pose entry. - pose_entry = np.full(pose_entry_size, -1, dtype=np.float32) - pose_entry[kpt_a_id] = connection[0] - pose_entry[kpt_b_id] = connection[1] - pose_entry[-1] = 2 - pose_entry[-2] = ( - np.sum(all_keypoints[connection[0:2], 2]) + connection[2] - ) - pose_entries.append(pose_entry) - elif pose_a_idx >= 0 and pose_b_idx >= 0 and pose_a_idx != pose_b_idx: - # Merge two poses are disjoint merge them, otherwise ignore connection. - pose_a = pose_entries[pose_a_idx] - pose_b = pose_entries[pose_b_idx] - if self.is_disjoint(pose_a, pose_b): - pose_a += pose_b - pose_a[:-2] += 1 - pose_a[-2] += connection[2] - del pose_entries[pose_b_idx] - elif pose_a_idx >= 0 and pose_b_idx >= 0: - # Adjust score of a pose. - pose_entries[pose_a_idx][-2] += connection[2] - elif pose_a_idx >= 0: - # Add a new limb into pose. - pose = pose_entries[pose_a_idx] - if pose[kpt_b_id] < 0: - pose[-2] += all_keypoints[connection[1], 2] - pose[kpt_b_id] = connection[1] - pose[-2] += connection[2] - pose[-1] += 1 - elif pose_b_idx >= 0: - # Add a new limb into pose. - pose = pose_entries[pose_b_idx] - if pose[kpt_a_id] < 0: - pose[-2] += all_keypoints[connection[0], 2] - pose[kpt_a_id] = connection[0] - pose[-2] += connection[2] - pose[-1] += 1 - return pose_entries - - @staticmethod - def connections_nms(a_idx, b_idx, affinity_scores): - # From all retrieved connections that share starting/ending keypoints leave only the top-scoring ones. - order = affinity_scores.argsort()[::-1] - affinity_scores = affinity_scores[order] - a_idx = a_idx[order] - b_idx = b_idx[order] - idx = [] - has_kpt_a = set() - has_kpt_b = set() - for t, (i, j) in enumerate(zip(a_idx, b_idx)): - if i not in has_kpt_a and j not in has_kpt_b: - idx.append(t) - has_kpt_a.add(i) - has_kpt_b.add(j) - idx = np.asarray(idx, dtype=np.int32) - return a_idx[idx], b_idx[idx], affinity_scores[idx] - - def group_keypoints(self, all_keypoints_by_type, pafs, pose_entry_size=20): - all_keypoints = np.concatenate(all_keypoints_by_type, axis=0) - pose_entries = [] - # For every limb. 
- for part_id, paf_channel in enumerate(self.paf_indices): - kpt_a_id, kpt_b_id = self.skeleton[part_id] - kpts_a = all_keypoints_by_type[kpt_a_id] - kpts_b = all_keypoints_by_type[kpt_b_id] - n = len(kpts_a) - m = len(kpts_b) - if n == 0 or m == 0: - continue - - # Get vectors between all pairs of keypoints, i.e. candidate limb vectors. - a = kpts_a[:, :2] - a = np.broadcast_to(a[None], (m, n, 2)) - b = kpts_b[:, :2] - vec_raw = (b[:, None, :] - a).reshape(-1, 1, 2) - - # Sample points along every candidate limb vector. - steps = 1 / (self.points_per_limb - 1) * vec_raw - points = steps * self.grid + a.reshape(-1, 1, 2) - points = points.round().astype(dtype=np.int32) - x = points[..., 0].ravel() - y = points[..., 1].ravel() - - # Compute affinity score between candidate limb vectors and part affinity field. - part_pafs = pafs[0, :, :, paf_channel : paf_channel + 2] - field = part_pafs[y, x].reshape(-1, self.points_per_limb, 2) - vec_norm = np.linalg.norm(vec_raw, ord=2, axis=-1, keepdims=True) - vec = vec_raw / (vec_norm + 1e-6) - affinity_scores = (field * vec).sum(-1).reshape(-1, self.points_per_limb) - valid_affinity_scores = affinity_scores > self.min_paf_alignment_score - valid_num = valid_affinity_scores.sum(1) - affinity_scores = (affinity_scores * valid_affinity_scores).sum(1) / ( - valid_num + 1e-6 - ) - success_ratio = valid_num / self.points_per_limb - - # Get a list of limbs according to the obtained affinity score. - valid_limbs = np.where( - np.logical_and(affinity_scores > 0, success_ratio > 0.8) - )[0] - if len(valid_limbs) == 0: - continue - b_idx, a_idx = np.divmod(valid_limbs, n) - affinity_scores = affinity_scores[valid_limbs] - - # Suppress incompatible connections. - a_idx, b_idx, affinity_scores = self.connections_nms( - a_idx, b_idx, affinity_scores - ) - connections = list( - zip( - kpts_a[a_idx, 3].astype(np.int32), - kpts_b[b_idx, 3].astype(np.int32), - affinity_scores, - ) - ) - if len(connections) == 0: - continue - - # Update poses with new connections. - pose_entries = self.update_poses( - kpt_a_id, - kpt_b_id, - all_keypoints, - connections, - pose_entries, - pose_entry_size, - ) - - # Remove poses with not enough points. 
- pose_entries = np.asarray(pose_entries, dtype=np.float32).reshape( - -1, pose_entry_size - ) - pose_entries = pose_entries[pose_entries[:, -1] >= 3] - return pose_entries, all_keypoints - - @staticmethod - def convert_to_coco_format(pose_entries, all_keypoints): - num_joints = 17 - coco_keypoints = [] - scores = [] - for pose in pose_entries: - if len(pose) == 0: - continue - keypoints = np.zeros(num_joints * 3) - reorder_map = [0, -1, 6, 8, 10, 5, 7, 9, 12, 14, 16, 11, 13, 15, 2, 1, 4, 3] - person_score = pose[-2] - for keypoint_id, target_id in zip(pose[:-2], reorder_map): - if target_id < 0: - continue - cx, cy, score = 0, 0, 0 # keypoint not found - if keypoint_id != -1: - cx, cy, score = all_keypoints[int(keypoint_id), 0:3] - keypoints[target_id * 3 + 0] = cx - keypoints[target_id * 3 + 1] = cy - keypoints[target_id * 3 + 2] = score - coco_keypoints.append(keypoints) - scores.append(person_score * max(0, (pose[-1] - 1))) # -1 for 'neck' - return np.asarray(coco_keypoints), np.asarray(scores) diff --git a/model_api/python/model_api/models/retinaface.py b/model_api/python/model_api/models/retinaface.py deleted file mode 100644 index 854482a5..00000000 --- a/model_api/python/model_api/models/retinaface.py +++ /dev/null @@ -1,565 +0,0 @@ -""" - Copyright (C) 2020-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import re -from itertools import product as product - -import numpy as np - -from .detection_model import DetectionModel -from .utils import Detection, DetectionWithLandmarks, clip_detections, nms - - -class RetinaFace(DetectionModel): - __model__ = "RetinaFace" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, (6, 9, 12)) - - self.detect_masks = len(self.outputs) == 12 - self.process_landmarks = len(self.outputs) > 6 - self.mask_threshold = 0.5 - self.postprocessor = RetinaFacePostprocessor( - detect_attributes=self.detect_masks, - process_landmarks=self.process_landmarks, - ) - - self.labels = ["Face"] if not self.detect_masks else ["Mask", "No mask"] - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters["resize_type"].update_default_value("standard") - parameters["confidence_threshold"].update_default_value(0.5) - return parameters - - def postprocess(self, outputs, meta): - scale_x = meta["resized_shape"][1] / meta["original_shape"][1] - scale_y = meta["resized_shape"][0] / meta["original_shape"][0] - - outputs = self.postprocessor.process_output( - outputs, scale_x, scale_y, self.confidence_threshold, self.mask_threshold - ) - return clip_detections(outputs, meta["original_shape"]) - - -class RetinaFacePyTorch(DetectionModel): - __model__ = "RetinaFace-PyTorch" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, (2, 3)) - - self.process_landmarks = len(self.outputs) == 3 - self.postprocessor = RetinaFacePyTorchPostprocessor( - process_landmarks=self.process_landmarks - ) - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters["resize_type"].update_default_value("standard") - parameters["confidence_threshold"].update_default_value(0.5) - parameters["labels"].update_default_value(["Face"]) - return parameters - - def postprocess(self, outputs, meta): - scale_x = meta["resized_shape"][1] / meta["original_shape"][1] - scale_y = meta["resized_shape"][0] / meta["original_shape"][0] - - outputs = self.postprocessor.process_output( - outputs, - scale_x, - scale_y, - self.confidence_threshold, - meta["resized_shape"][:2], - ) - return clip_detections(outputs, meta["original_shape"]) - - -class RetinaFacePostprocessor: - def __init__(self, detect_attributes=False, process_landmarks=True): - self._detect_masks = detect_attributes - self._process_landmarks = process_landmarks - _ratio = (1.0,) - self._anchor_cfg = { - 32: {"SCALES": (32, 16), "BASE_SIZE": 16, "RATIOS": _ratio}, - 16: {"SCALES": (8, 4), "BASE_SIZE": 16, "RATIOS": _ratio}, - 8: {"SCALES": (2, 1), "BASE_SIZE": 16, "RATIOS": _ratio}, - } - self._features_stride_fpn = [32, 16, 8] - self._anchors_fpn = dict( - zip( - self._features_stride_fpn, - self.generate_anchors_fpn(cfg=self._anchor_cfg), - ) - ) - self._num_anchors = dict( - zip( - self._features_stride_fpn, - [anchors.shape[0] for anchors in self._anchors_fpn.values()], - ) - ) - self.landmark_std = 0.2 if detect_attributes else 1.0 - self.nms_threshold = 0.5 if process_landmarks else 0.3 - - @staticmethod - def generate_anchors_fpn(cfg): - def generate_anchors( - base_size=16, ratios=(0.5, 1, 2), scales=2 ** np.arange(3, 6) - ): - base_anchor = np.array([1, 1, base_size, base_size]) - 1 - ratio_anchors = _ratio_enum(base_anchor, ratios) - anchors = np.vstack( - [ - 
_scale_enum(ratio_anchors[i, :], scales) - for i in range(ratio_anchors.shape[0]) - ] - ) - return anchors - - def _ratio_enum(anchor, ratios): - w, h, x_ctr, y_ctr = _generate_wh_ctrs(anchor) - size = w * h - size_ratios = size / ratios - ws = np.round(np.sqrt(size_ratios)) - hs = np.round(ws * ratios) - anchors = _make_anchors(ws, hs, x_ctr, y_ctr) - return anchors - - def _scale_enum(anchor, scales): - w, h, x_ctr, y_ctr = _generate_wh_ctrs(anchor) - ws = w * scales - hs = h * scales - anchors = _make_anchors(ws, hs, x_ctr, y_ctr) - return anchors - - def _generate_wh_ctrs(anchor): - w = anchor[2] - anchor[0] + 1 - h = anchor[3] - anchor[1] + 1 - x_ctr = anchor[0] + 0.5 * (w - 1) - y_ctr = anchor[1] + 0.5 * (h - 1) - return w, h, x_ctr, y_ctr - - def _make_anchors(ws, hs, x_ctr, y_ctr): - ws = ws[:, np.newaxis] - hs = hs[:, np.newaxis] - anchors = np.hstack( - ( - x_ctr - 0.5 * (ws - 1), - y_ctr - 0.5 * (hs - 1), - x_ctr + 0.5 * (ws - 1), - y_ctr + 0.5 * (hs - 1), - ) - ) - return anchors - - rpn_feat_stride = [int(k) for k in cfg] - rpn_feat_stride.sort(reverse=True) - anchors = [] - for stride in rpn_feat_stride: - feature_info = cfg[stride] - bs = feature_info["BASE_SIZE"] - __ratios = np.array(feature_info["RATIOS"]) - __scales = np.array(feature_info["SCALES"]) - anchors.append(generate_anchors(bs, __ratios, __scales)) - - return anchors - - def process_output( - self, raw_output, scale_x, scale_y, face_prob_threshold, mask_prob_threshold - ): - bboxes_outputs = [ - raw_output[name][0] for name in raw_output if re.search(".bbox.", name) - ] - bboxes_outputs.sort(key=lambda x: x.shape[1]) - - scores_outputs = [ - raw_output[name][0] for name in raw_output if re.search(".cls.", name) - ] - scores_outputs.sort(key=lambda x: x.shape[1]) - - if self._process_landmarks: - landmarks_outputs = [ - raw_output[name][0] - for name in raw_output - if re.search(".landmark.", name) - ] - landmarks_outputs.sort(key=lambda x: x.shape[1]) - - if self._detect_masks: - type_scores_outputs = [ - raw_output[name][0] for name in raw_output if re.search(".type.", name) - ] - type_scores_outputs.sort(key=lambda x: x.shape[1]) - - proposals_list = [] - scores_list = [] - landmarks_list = [] - mask_scores_list = [] - for idx, s in enumerate(self._features_stride_fpn): - anchor_num = self._num_anchors[s] - scores = self._get_scores(scores_outputs[idx], anchor_num) - bbox_deltas = bboxes_outputs[idx] - height, width = bbox_deltas.shape[1], bbox_deltas.shape[2] - anchors_fpn = self._anchors_fpn[s] - anchors = self.anchors_plane(height, width, int(s), anchors_fpn) - anchors = anchors.reshape((height * width * anchor_num, 4)) - proposals = self._get_proposals(bbox_deltas, anchor_num, anchors) - threshold_mask = scores >= face_prob_threshold - - proposals_list.extend(proposals[threshold_mask, :]) - scores_list.extend(scores[threshold_mask]) - if self._process_landmarks: - landmarks = self._get_landmarks( - landmarks_outputs[idx], anchor_num, anchors - ) - landmarks_list.extend(landmarks[threshold_mask, :]) - if self._detect_masks: - masks = self._get_mask_scores(type_scores_outputs[idx], anchor_num) - mask_scores_list.extend(masks[threshold_mask]) - - if len(scores_list) > 0: - proposals_list = np.array(proposals_list) - scores_list = np.array(scores_list) - landmarks_list = np.array(landmarks_list) - mask_scores_list = np.array(mask_scores_list) - x_mins, y_mins, x_maxs, y_maxs = proposals_list.T - keep = nms( - x_mins, - y_mins, - x_maxs, - y_maxs, - scores_list, - self.nms_threshold, - include_boundaries=not 
self._process_landmarks, - ) - proposals_list = proposals_list[keep] - scores_list = scores_list[keep] - if self._process_landmarks: - landmarks_list = landmarks_list[keep] - if self._detect_masks: - mask_scores_list = mask_scores_list[keep] - - result = [] - if len(scores_list) != 0: - scores = np.reshape(scores_list, -1) - mask_scores_list = np.reshape(mask_scores_list, -1) - x_mins, y_mins, x_maxs, y_maxs = np.array( - proposals_list - ).T # pylint: disable=E0633 - x_mins /= scale_x - x_maxs /= scale_x - y_mins /= scale_y - y_maxs /= scale_y - - result = [] - if self._process_landmarks: - landmarks_x_coords = ( - np.array(landmarks_list)[:, :, ::2].reshape(len(landmarks_list), -1) - / scale_x - ) - landmarks_y_coords = ( - np.array(landmarks_list)[:, :, 1::2].reshape( - len(landmarks_list), -1 - ) - / scale_y - ) - if self._detect_masks: - for i in range(len(scores_list)): - result.append( - DetectionWithLandmarks( - x_mins[i], - y_mins[i], - x_maxs[i], - y_maxs[i], - scores[i], - 0 if mask_scores_list[i] > mask_prob_threshold else 1, - landmarks_x_coords[i], - landmarks_y_coords[i], - ) - ) - else: - for i in range(len(scores_list)): - result.append( - DetectionWithLandmarks( - x_mins[i], - y_mins[i], - x_maxs[i], - y_maxs[i], - scores[i], - 0, - landmarks_x_coords[i], - landmarks_y_coords[i], - ) - ) - else: - for i in range(len(scores_list)): - result.append( - Detection( - x_mins[i], y_mins[i], x_maxs[i], y_maxs[i], scores[i], 0 - ) - ) - - return result - - def _get_proposals(self, bbox_deltas, anchor_num, anchors): - bbox_deltas = bbox_deltas.transpose((1, 2, 0)) - bbox_pred_len = bbox_deltas.shape[2] // anchor_num - bbox_deltas = bbox_deltas.reshape((-1, bbox_pred_len)) - proposals = self.bbox_pred(anchors, bbox_deltas) - return proposals - - @staticmethod - def _get_scores(scores, anchor_num): - scores = scores[anchor_num:, :, :] - scores = scores.transpose((1, 2, 0)).reshape(-1) - return scores - - @staticmethod - def _get_mask_scores(type_scores, anchor_num): - mask_scores = type_scores[anchor_num * 2 :, :, :] - mask_scores = mask_scores.transpose((1, 2, 0)).reshape(-1) - return mask_scores - - def _get_landmarks(self, landmark_deltas, anchor_num, anchors): - landmark_pred_len = landmark_deltas.shape[0] // anchor_num - landmark_deltas = landmark_deltas.transpose((1, 2, 0)).reshape( - (-1, 5, landmark_pred_len // 5) - ) - landmark_deltas *= self.landmark_std - landmarks = self.landmark_pred(anchors, landmark_deltas) - return landmarks - - @staticmethod - def bbox_pred(boxes, box_deltas): - if boxes.shape[0] == 0: - return np.zeros((0, box_deltas.shape[1])) - - boxes = boxes.astype(float, copy=False) - widths = boxes[:, 2] - boxes[:, 0] + 1.0 - heights = boxes[:, 3] - boxes[:, 1] + 1.0 - ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) - ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) - dx = box_deltas[:, 0:1] - dy = box_deltas[:, 1:2] - dw = box_deltas[:, 2:3] - dh = box_deltas[:, 3:4] - pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] - pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] - pred_w = np.exp(dw) * widths[:, np.newaxis] - pred_h = np.exp(dh) * heights[:, np.newaxis] - pred_boxes = np.zeros(box_deltas.shape) - pred_boxes[:, 0:1] = pred_ctr_x - 0.5 * (pred_w - 1.0) - pred_boxes[:, 1:2] = pred_ctr_y - 0.5 * (pred_h - 1.0) - pred_boxes[:, 2:3] = pred_ctr_x + 0.5 * (pred_w - 1.0) - pred_boxes[:, 3:4] = pred_ctr_y + 0.5 * (pred_h - 1.0) - - if box_deltas.shape[1] > 4: - pred_boxes[:, 4:] = box_deltas[:, 4:] - - return pred_boxes - - @staticmethod - def 
anchors_plane(height, width, stride, base_anchors): - num_anchors = base_anchors.shape[0] - all_anchors = np.zeros((height, width, num_anchors, 4)) - for iw in range(width): - sw = iw * stride - for ih in range(height): - sh = ih * stride - for k in range(num_anchors): - all_anchors[ih, iw, k, 0] = base_anchors[k, 0] + sw - all_anchors[ih, iw, k, 1] = base_anchors[k, 1] + sh - all_anchors[ih, iw, k, 2] = base_anchors[k, 2] + sw - all_anchors[ih, iw, k, 3] = base_anchors[k, 3] + sh - - return all_anchors - - @staticmethod - def landmark_pred(boxes, landmark_deltas): - if boxes.shape[0] == 0: - return np.zeros((0, landmark_deltas.shape[1])) - boxes = boxes.astype(float, copy=False) - widths = boxes[:, 2] - boxes[:, 0] + 1.0 - heights = boxes[:, 3] - boxes[:, 1] + 1.0 - ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) - ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) - pred = landmark_deltas.copy() - for i in range(5): - pred[:, i, 0] = landmark_deltas[:, i, 0] * widths + ctr_x - pred[:, i, 1] = landmark_deltas[:, i, 1] * heights + ctr_y - - return pred - - -class RetinaFacePyTorchPostprocessor: - def __init__(self, process_landmarks=True): - self._process_landmarks = process_landmarks - self.nms_threshold = 0.5 if process_landmarks else 0.3 - self.variance = [0.1, 0.2] - - def process_output( - self, raw_output, scale_x, scale_y, face_prob_threshold, image_size - ): - bboxes_output = [ - raw_output[name][0] for name in raw_output if re.search(".bbox.", name) - ][0] - - scores_output = [ - raw_output[name][0] for name in raw_output if re.search(".cls.", name) - ][0] - - if self._process_landmarks: - landmarks_output = [ - raw_output[name][0] - for name in raw_output - if re.search(".landmark.", name) - ][0] - - prior_data = self.generate_prior_data(image_size) - proposals = self._get_proposals(bboxes_output, prior_data, image_size) - scores = scores_output[:, 1] - filter_idx = np.where(scores > face_prob_threshold)[0] - proposals = proposals[filter_idx] - scores = scores[filter_idx] - if self._process_landmarks: - landmarks = self._get_landmarks(landmarks_output, prior_data, image_size) - landmarks = landmarks[filter_idx] - - if np.size(scores) > 0: - x_mins, y_mins, x_maxs, y_maxs = proposals.T - keep = nms( - x_mins, - y_mins, - x_maxs, - y_maxs, - scores, - self.nms_threshold, - include_boundaries=not self._process_landmarks, - ) - - proposals = proposals[keep] - scores = scores[keep] - if self._process_landmarks: - landmarks = landmarks[keep] - - result = [] - if np.size(scores) != 0: - scores = np.reshape(scores, -1) - x_mins, y_mins, x_maxs, y_maxs = np.array( - proposals - ).T # pylint: disable=E0633 - x_mins /= scale_x - x_maxs /= scale_x - y_mins /= scale_y - y_maxs /= scale_y - - result = [] - if self._process_landmarks: - landmarks_x_coords = np.array(landmarks)[:, ::2] / scale_x - landmarks_y_coords = np.array(landmarks)[:, 1::2] / scale_y - for x_min, y_min, x_max, y_max, score, landmarks_x, landmarks_y in zip( - x_mins, - y_mins, - x_maxs, - y_maxs, - scores, - landmarks_x_coords, - landmarks_y_coords, - ): - result.append( - DetectionWithLandmarks( - x_min, - y_min, - x_max, - y_max, - score, - 0, - landmarks_x, - landmarks_y, - ) - ) - else: - for x_min, y_min, x_max, y_max, score in zip( - x_mins, y_mins, x_maxs, y_maxs, scores - ): - result.append(Detection(x_min, y_min, x_max, y_max, score, 0)) - - return result - - @staticmethod - def generate_prior_data(image_size): - global_min_sizes = [[16, 32], [64, 128], [256, 512]] - steps = [8, 16, 32] - anchors = [] - feature_maps = [ - 
[int(np.rint(image_size[0] / step)), int(np.rint(image_size[1] / step))] - for step in steps - ] - for idx, feature_map in enumerate(feature_maps): - min_sizes = global_min_sizes[idx] - for i, j in product(range(feature_map[0]), range(feature_map[1])): - for min_size in min_sizes: - s_kx = min_size / image_size[1] - s_ky = min_size / image_size[0] - dense_cx = [x * steps[idx] / image_size[1] for x in [j + 0.5]] - dense_cy = [y * steps[idx] / image_size[0] for y in [i + 0.5]] - for cy, cx in product(dense_cy, dense_cx): - anchors += [cx, cy, s_kx, s_ky] - - priors = np.array(anchors).reshape((-1, 4)) - return priors - - def _get_proposals(self, raw_boxes, priors, image_size): - proposals = self.decode_boxes(raw_boxes, priors, self.variance) - proposals[:, ::2] = proposals[:, ::2] * image_size[1] - proposals[:, 1::2] = proposals[:, 1::2] * image_size[0] - return proposals - - @staticmethod - def decode_boxes(raw_boxes, priors, variance): - boxes = np.concatenate( - ( - priors[:, :2] + raw_boxes[:, :2] * variance[0] * priors[:, 2:], - priors[:, 2:] * np.exp(raw_boxes[:, 2:] * variance[1]), - ), - 1, - ) - boxes[:, :2] -= boxes[:, 2:] / 2 - boxes[:, 2:] += boxes[:, :2] - return boxes - - def _get_landmarks(self, raw_landmarks, priors, image_size): - landmarks = self.decode_landmarks(raw_landmarks, priors, self.variance) - landmarks[:, ::2] = landmarks[:, ::2] * image_size[1] - landmarks[:, 1::2] = landmarks[:, 1::2] * image_size[0] - return landmarks - - @staticmethod - def decode_landmarks(raw_landmarks, priors, variance): - landmarks = np.concatenate( - ( - priors[:, :2] + raw_landmarks[:, :2] * variance[0] * priors[:, 2:], - priors[:, :2] + raw_landmarks[:, 2:4] * variance[0] * priors[:, 2:], - priors[:, :2] + raw_landmarks[:, 4:6] * variance[0] * priors[:, 2:], - priors[:, :2] + raw_landmarks[:, 6:8] * variance[0] * priors[:, 2:], - priors[:, :2] + raw_landmarks[:, 8:10] * variance[0] * priors[:, 2:], - ), - 1, - ) - return landmarks diff --git a/model_api/python/model_api/models/tokens_bert.py b/model_api/python/model_api/models/tokens_bert.py deleted file mode 100644 index b5b2209a..00000000 --- a/model_api/python/model_api/models/tokens_bert.py +++ /dev/null @@ -1,119 +0,0 @@ -""" - Copyright (c) 2020-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import string -import unicodedata - - -# A class to store context as text, its tokens and embedding vector -class ContextData: - def __init__(self, tokens_id, tokens_se, context=None, emb=None): - self.c_tokens_id = tokens_id - self.c_tokens_se = tokens_se - self.context = context - self.emb = emb - - -class ContextWindow: - def __init__(self, window_len, tokens_id, tokens_se): - self.tokens_id = tokens_id - self.tokens_se = tokens_se - self.window_len = window_len - self.stride = self.window_len // 2 # overlap by half - self.total_len = len(self.tokens_id) - self.s, self.e = 0, min(self.window_len, self.total_len) - - def move(self): - self.s = min(self.s + self.stride, self.total_len) - self.e = min(self.s + self.window_len, self.total_len) - - def is_over(self): - return self.e - self.s < self.stride - - def get_context_data(self, context=None): - return ContextData( - self.tokens_id[self.s : self.e], - self.tokens_se[self.s : self.e], - context=context, - ) - - -# load vocabulary file for encoding -def load_vocab_file(vocab_file_name): - with open(vocab_file_name, "r", encoding="utf-8") as r: - return {t.rstrip("\n"): i for i, t in enumerate(r.readlines())} - - -# split word by vocab items and get tok codes -# iteratively return codes -def encode_by_voc(w, vocab): - # remove mark and control chars - def clean_word(w): - wo = "" # accumulator for output word - for c in unicodedata.normalize("NFD", w): - c_cat = unicodedata.category(c) - # remove mark nonspacing code and controls - if c_cat != "Mn" and c_cat[0] != "C": - wo += c - return wo - - w = clean_word(w) - - res = [] - for s0, e0 in split_to_words(w): - s, e = s0, e0 - tokens = [] - while e > s: - subword = w[s:e] if s == s0 else "##" + w[s:e] - if subword in vocab: - tokens.append(vocab[subword]) - s, e = e, e0 - else: - e -= 1 - if s < e0: - tokens = [vocab["[UNK]"]] - res.extend(tokens) - return res - - -# split big text into words by spaces -# iteratively return words -def split_to_words(text): - prev_is_sep = True # mark initial prev as space to start word from 0 char - for i, c in enumerate(text + " "): - is_punc = c in string.punctuation or unicodedata.category(c)[0] == "P" - cur_is_sep = c.isspace() or is_punc - if prev_is_sep != cur_is_sep: - if prev_is_sep: - start = i - else: - yield start, i - del start - if is_punc: - yield i, i + 1 - prev_is_sep = cur_is_sep - - -# get big text and return list of token id and start-end positions for each id in original texts -def text_to_tokens(text, vocab): - tokens_id = [] - tokens_se = [] - for s, e in split_to_words(text): - for tok in encode_by_voc(text[s:e], vocab): - tokens_id.append(tok) - tokens_se.append((s, e)) - - return tokens_id, tokens_se diff --git a/model_api/python/model_api/models/ultra_lightweight_face_detection.py b/model_api/python/model_api/models/ultra_lightweight_face_detection.py deleted file mode 100644 index 5b10335b..00000000 --- a/model_api/python/model_api/models/ultra_lightweight_face_detection.py +++ /dev/null @@ -1,100 +0,0 @@ -""" - Copyright (c) 2021-2024 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -""" - -import numpy as np - -from .detection_model import DetectionModel -from .types import NumericalValue -from .utils import Detection, nms - - -class UltraLightweightFaceDetection(DetectionModel): - __model__ = "Ultra_LightWeight_Face_Detection" - - def __init__(self, inference_adapter, configuration=dict(), preload=False): - super().__init__(inference_adapter, configuration, preload) - self._check_io_number(1, 2) - self.labels = ["Face"] - self.bboxes_blob_name, self.scores_blob_name = self._get_outputs() - - def _get_outputs(self): - (bboxes_blob_name, bboxes_layer), ( - scores_blob_name, - scores_layer, - ) = self.outputs.items() - - if bboxes_layer.shape[1] != scores_layer.shape[1]: - self.raise_error( - "Expected the same second dimension for boxes and scores, but got {} and {}".format( - bboxes_layer.shape, scores_layer.shape - ) - ) - - if bboxes_layer.shape[2] == 4: - return bboxes_blob_name, scores_blob_name - elif scores_layer.shape[2] == 4: - return scores_blob_name, bboxes_blob_name - else: - self.raise_error( - "Expected shape [:,:,4] for bboxes output, but got {} and {}".format( - bboxes_layer.shape, scores_layer.shape - ) - ) - - @classmethod - def parameters(cls): - parameters = super().parameters() - parameters.update( - { - "iou_threshold": NumericalValue( - default_value=0.5, - description="Threshold for non-maximum suppression (NMS) intersection over union (IOU) filtering", - ), - } - ) - parameters["resize_type"].update_default_value("standard") - parameters["confidence_threshold"].update_default_value(0.5) - parameters["labels"].update_default_value(["Face"]) - return parameters - - def postprocess(self, outputs, meta): - detections = self._parse_outputs(outputs, meta) - detections = self._resize_detections(detections, meta) - return detections - - def _parse_outputs(self, outputs, meta): - boxes = outputs[self.bboxes_blob_name][0] - scores = outputs[self.scores_blob_name][0] - - score = np.transpose(scores)[1] - - mask = score > self.confidence_threshold - filtered_boxes, filtered_score = boxes[mask, :], score[mask] - - x_mins, y_mins, x_maxs, y_maxs = filtered_boxes.T - - keep = nms(x_mins, y_mins, x_maxs, y_maxs, filtered_score, self.iou_threshold) - - return [ - Detection(*det, 0) - for det in zip( - x_mins[keep], - y_mins[keep], - x_maxs[keep], - y_maxs[keep], - filtered_score[keep], - ) - ]
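For readers tracing what the removed detection wrappers above (UltraLightweightFaceDetection, NanoDet, RetinaFace, Yolact) had in common, below is a minimal, self-contained sketch of the confidence-filter-then-greedy-IoU-NMS step they all delegated to a shared `nms` utility. It is illustrative only: the function name `greedy_nms`, its exact signature, and the epsilon handling are assumptions made for this sketch and do not reproduce the actual `model_api.models.utils.nms` helper (which, for example, also exposes an `include_boundaries` option used by the removed NanoDet and RetinaFace code).

# Minimal sketch (assumed names/signature, not the model_api utility) of greedy
# non-maximum suppression over per-class, confidence-filtered boxes.
import numpy as np


def greedy_nms(x_mins, y_mins, x_maxs, y_maxs, scores, iou_threshold=0.5):
    """Return indices of boxes kept after greedy IoU-based suppression."""
    areas = (x_maxs - x_mins) * (y_maxs - y_mins)
    order = scores.argsort()[::-1]  # visit boxes from highest to lowest score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # Intersection of the current top-scoring box with all remaining boxes.
        xx1 = np.maximum(x_mins[i], x_mins[order[1:]])
        yy1 = np.maximum(y_mins[i], y_mins[order[1:]])
        xx2 = np.minimum(x_maxs[i], x_maxs[order[1:]])
        yy2 = np.minimum(y_maxs[i], y_maxs[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        iou = inter / (areas[i] + areas[order[1:]] - inter + 1e-9)
        # Drop boxes that overlap the selected one above the threshold.
        order = order[1:][iou <= iou_threshold]
    return np.asarray(keep, dtype=int)


if __name__ == "__main__":
    boxes = np.array([[10, 10, 50, 50], [12, 12, 52, 52], [100, 100, 140, 140]], dtype=float)
    scores = np.array([0.9, 0.8, 0.7])
    kept = greedy_nms(boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3], scores, 0.5)
    print(kept)  # the near-duplicate second box is suppressed; boxes 0 and 2 remain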