Skip to content

Commit d513a3b

Browse files
ielizakorobeinikov
andauthored
Adding style transfer functionality into "image_processing_demo" (#3000)
* trying to add style transfer into image processing demo * removing trailing whitespace * remove extra scale * updating readme and adding picture from original IE sample * moving text in readme * removing unnecessary function for type print * fast-neural-style-mosaic-onnx model added to model list * fixing helping message, changing picture for README Co-authored-by: akorobeinikov <[email protected]>
1 parent 6994142 commit d513a3b

File tree

6 files changed

+145
-2
lines changed

6 files changed

+145
-2
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*
2+
// Copyright (C) 2021 Intel Corporation
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
*/
16+
17+
#pragma once

#include <memory>
#include <string>

#include "image_model.h"

/// Model wrapper for neural style transfer networks.
/// Consumes a single image and produces a stylized image resized back to the
/// source resolution (see postprocess in the .cpp).
class StyleTransferModel : public ImageModel {
public:
    /// Constructor
    /// @param modelFileName name of model to load
    StyleTransferModel(const std::string& modelFileName);

    /// Copies the input image into the request's input blob and records the
    /// source resolution for later rescaling.
    std::shared_ptr<InternalModelData> preprocess(
        const InputData& inputData, InferenceEngine::InferRequest::Ptr& request) override;

    /// Converts the network's planar float output into an 8-bit BGR ImageResult.
    std::unique_ptr<ResultBase> postprocess(InferenceResult& infResult) override;

protected:
    /// Validates that the network has exactly one 4-D, 3-channel input and
    /// output, and sets both to FP32 precision.
    void prepareInputsOutputs(InferenceEngine::CNNNetwork& cnnNetwork) override;
};
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
// Copyright (C) 2021 Intel Corporation
3+
//
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at
7+
//
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
//
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
*/
16+
17+
#include "models/style_transfer_model.h"
18+
19+
#include "utils/ocv_common.hpp"
20+
#include <utils/slog.hpp>
21+
22+
#include <string>
23+
#include <vector>
24+
#include <memory>
25+
26+
using namespace InferenceEngine;
27+
28+
// Delegates to ImageModel; the second argument is false here — presumably this
// disables the base class's automatic input resizing (confirm against ImageModel).
StyleTransferModel::StyleTransferModel(const std::string& modelFileName)
    : ImageModel(modelFileName, false) {}
31+
32+
void StyleTransferModel::prepareInputsOutputs(InferenceEngine::CNNNetwork& cnnNetwork) {
    // ------------------- Validate and configure the single network input -------------------
    ICNNNetwork::InputShapes inputShapes = cnnNetwork.getInputShapes();
    if (inputShapes.size() != 1) {
        throw std::runtime_error("Demo supports topologies only with 1 input");
    }
    inputsNames.push_back(inputShapes.begin()->first);

    // Expect a 4-D NCHW-like shape with batch 1 and 3 channels.
    const SizeVector& inDims = inputShapes.begin()->second;
    if (inDims.size() != 4 || inDims[0] != 1 || inDims[1] != 3) {
        throw std::runtime_error("3-channel 4-dimensional model's input is expected");
    }
    InputInfo& inputInfo = *cnnNetwork.getInputsInfo().begin()->second;
    inputInfo.setPrecision(Precision::FP32);

    // ------------------- Validate and configure the single network output ------------------
    const OutputsDataMap& outputsInfo = cnnNetwork.getOutputsInfo();
    if (outputsInfo.size() != 1) {
        throw std::runtime_error("Demo supports topologies only with 1 output");
    }
    outputsNames.push_back(outputsInfo.begin()->first);

    Data& outputData = *outputsInfo.begin()->second;
    outputData.setPrecision(Precision::FP32);

    // Output must mirror the input constraints: 4-D, batch 1, 3 channels.
    const SizeVector& outDims = outputData.getTensorDesc().getDims();
    if (outDims.size() != 4 || outDims[0] != 1 || outDims[1] != 3) {
        throw std::runtime_error("3-channel 4-dimensional model's output is expected");
    }
}
59+
60+
std::shared_ptr<InternalModelData> StyleTransferModel::preprocess(const InputData& inputData, InferenceEngine::InferRequest::Ptr& request) {
61+
auto imgData = inputData.asRef<ImageInputData>();
62+
auto& img = imgData.inputImage;
63+
64+
Blob::Ptr minput = request->GetBlob(inputsNames[0]);
65+
matToBlob(img, minput);
66+
return std::make_shared<InternalImageModelData>(img.cols, img.rows);
67+
}
68+
69+
std::unique_ptr<ResultBase> StyleTransferModel::postprocess(InferenceResult& infResult) {
    // Own the result from the start: the original used a raw `new` and only
    // wrapped it in unique_ptr at the return, so any throwing OpenCV call in
    // between (merge/resize can throw cv::Exception) would leak it.
    auto result = std::make_unique<ImageResult>();
    *static_cast<ResultBase*>(result.get()) = static_cast<ResultBase&>(infResult);

    const auto& inputImgSize = infResult.internalModelData->asRef<InternalImageModelData>();

    // Read-only mapping of the output blob; the data is interpreted as floats.
    LockedMemory<const void> outMapped = infResult.getFirstOutputBlob()->rmap();
    const auto outputData = outMapped.as<float*>();

    // Output dims are {1, 3, H, W} — validated in prepareInputsOutputs.
    const SizeVector& outSizeVector = infResult.getFirstOutputBlob()->getTensorDesc().getDims();
    size_t outHeight = outSizeVector[2];
    size_t outWidth = outSizeVector[3];
    size_t numOfPixels = outWidth * outHeight;

    // Wrap the three planar channels without copying. Planes are taken in
    // reverse order (2, 1, 0) — presumably the network emits RGB and this
    // produces OpenCV's BGR layout; confirm against the model's spec.
    std::vector<cv::Mat> imgPlanes{
        cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels * 2])),
        cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[numOfPixels])),
        cv::Mat(outHeight, outWidth, CV_32FC1, &(outputData[0]))};
    cv::Mat resultImg;
    cv::merge(imgPlanes, resultImg);

    // Scale back to the source resolution recorded by preprocess(), then
    // convert float pixels to 8-bit for display/saving.
    cv::resize(resultImg, result->resultImage, cv::Size(inputImgSize.inputImgWidth, inputImgSize.inputImgHeight));
    result->resultImage.convertTo(result->resultImage, CV_8UC3);

    return result;
}

demos/image_processing_demo/cpp/README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ This demo processes the image according to the selected type of processing. The
55
* `super_resolution`
66
* `deblurring`
77
* `jpeg_restoration`
8+
* `style_transfer`
89

910
## Examples
1011

@@ -38,6 +39,10 @@ Super resolution:
3839

3940
For this type of image processing the user can use the flag `-jc`. It allows performing compression before the inference (useful when the user wants to test the model on high-quality JPEG images).
4041

42+
4. Example for style_transfer:
43+
44+
![](./assets/style_transfer.jpg)
45+
4146
## How It Works
4247

4348
Before running the demo, user must choose type of processing and model for this processing.\
@@ -51,6 +56,8 @@ For `deblurring` user can use [deblurgan-v2](../../../models/public/deblurgan-v2
5156

5257
For `jpeg_restoration` user can use [fbcnn](../../../models/public/fbcnn/README.md) - flexible blind convolutional neural network for JPEG artifacts removal.
5358

59+
For `style_transfer` user can use [fast-neural-style-mosaic-onnx](../../../models/public/fast-neural-style-mosaic-onnx/README.md) - one of the style transfer models designed to mix the content of an image with the style of another image.
60+
5461
The demo runs inference and shows results for each image captured from an input. Depending on number of inference requests processing simultaneously (-nireq parameter) the pipeline might minimize the time required to process each single image (for nireq 1) or maximizes utilization of the device and overall processing performance.
5562

5663
> **NOTE**: By default, Open Model Zoo demos expect input with BGR channels order. If you trained your model to work with RGB order, you need to manually rearrange the default channels order in the demo application or reconvert your model using the Model Optimizer tool with `--reverse_input_channels` argument specified. For more information about the argument, refer to **When to Reverse Input Channels** section of [Converting a Model Using General Conversion Parameters](https://docs.openvino.ai/latest/openvino_docs_MO_DG_prepare_model_convert_model_Converting_Model.html#general-conversion-parameters).
@@ -68,6 +75,7 @@ This file can be used as a parameter for [Model Downloader](../../../tools/model
6875
* text-image-super-resolution-0001
6976
* deblurgan-v2
7077
* fbcnn
78+
* fast-neural-style-mosaic-onnx
7179

7280
> **NOTE**: Refer to the tables [Intel's Pre-Trained Models Device Support](../../../models/intel/device_support.md) and [Public Pre-Trained Models Device Support](../../../models/public/device_support.md) for the details on models inference support at different devices.
7381
@@ -82,7 +90,7 @@ image_processing_demo_async [OPTION]
8290
Options:
8391
8492
-h Print a usage message.
85-
-at "<type>" Required. Type of the network, either 'sr' for Super Resolution task, 'deblur' for Deblurring, 'jr' for JPEGRestoration.
93+
-at "<type>" Required. Type of the network, either 'sr' for Super Resolution task, 'deblur' for Deblurring, 'jr' for JPEGRestoration, 'style' for Style Transfer.
8694
-i "<path>" Required. An input to process. The input must be a single image, a folder of images, video file or camera id.
8795
-m "<path>" Required. Path to an .xml file with a trained model.
8896
-o "<path>" Optional. Name of the output file(s) to save.
103 KB
Loading

demos/image_processing_demo/cpp/main.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,15 @@
3939
#include <models/super_resolution_model.h>
4040
#include <models/deblurring_model.h>
4141
#include <models/jpeg_restoration_model.h>
42+
#include <models/style_transfer_model.h>
4243
#include <pipelines/metadata.h>
4344
#include "visualizer.hpp"
4445

4546
DEFINE_INPUT_FLAGS
4647
DEFINE_OUTPUT_FLAGS
4748

4849
static const char help_message[] = "Print a usage message.";
49-
static const char at_message[] = "Required. Type of the network, either 'sr' for Super Resolution task, 'deblur' for Deblurring, 'jr' for JPEGRestoration.";
50+
static const char at_message[] = "Required. Type of the network, either 'sr' for Super Resolution task, 'deblur' for Deblurring, 'jr' for JPEGRestoration, 'style' for Style Transfer task.";
5051
static const char model_message[] = "Required. Path to an .xml file with a trained model.";
5152
static const char target_device_message[] = "Optional. Specify the target device to infer on (the list of available devices is shown below). "
5253
"Default value is CPU. Use \"-d HETERO:<comma-separated_devices_list>\" format to specify HETERO plugin. "
@@ -145,6 +146,9 @@ std::unique_ptr<ImageModel> getModel(const cv::Size& frameSize, const std::strin
145146
}
146147
if (type == "jr") {
147148
return std::unique_ptr<ImageModel>(new JPEGRestorationModel(FLAGS_m, frameSize, doCompression));
149+
}
150+
if (type == "style") {
151+
return std::unique_ptr<ImageModel>(new StyleTransferModel(FLAGS_m));
148152
}
149153
throw std::invalid_argument("No model type or invalid model type (-at) provided: " + FLAGS_at);
150154
}

demos/image_processing_demo/cpp/models.lst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@ text-image-super-resolution-????
66
deblurgan-v2
77
# For -at jr
88
fbcnn
9+
# For -at style
10+
fast-neural-style-mosaic-onnx

0 commit comments

Comments
 (0)