|
// This example provides digit recognition based on LeNet-5 and connected component analysis.
// It makes it possible for OpenCV beginners to run dnn models in real time using only a CPU.
// It reads pictures from the camera in real time and displays the recognized digits as overlays on top of the original digits.
| 4 | +// |
// In order to achieve a better display effect, please write the numbers on white paper so that they fill most of the camera frame.
| 6 | +// |
| 7 | +// You can follow the following guide to train LeNet-5 by yourself using the MNIST dataset. |
| 8 | +// https://github.com/intel/caffe/blob/a3d5b022fe026e9092fc7abc7654b1162ab9940d/examples/mnist/readme.md |
| 9 | +// |
| 10 | +// You can also download already trained model directly. |
| 11 | +// https://github.com/zihaomu/opencv_digit_text_recognition_demo/tree/master/src |
| 12 | + |
| 13 | + |
| 14 | +#include <opencv2/imgproc.hpp> |
| 15 | +#include <opencv2/highgui.hpp> |
| 16 | +#include <opencv2/dnn.hpp> |
| 17 | + |
| 18 | +#include <iostream> |
| 19 | +#include <vector> |
| 20 | + |
| 21 | +using namespace cv; |
| 22 | +using namespace cv::dnn; |
| 23 | + |
// Command-line options understood by cv::CommandLineParser:
// each entry is "{ name alias | default | help text }".
const char *keys =
    "{ help h | | Print help message. }"
    "{ input i | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ device | 0 | camera device number. }"
    "{ modelBin | | Path to a binary .caffemodel file contains trained network.}"
    "{ modelTxt | | Path to a .prototxt file contains the model definition of trained network.}"
    "{ width | 640 | Set the width of the camera }"
    "{ height | 480 | Set the height of the camera }"
    "{ thr | 0.7 | Confidence threshold. }";
| 33 | + |
| 34 | +// Find best class for the blob (i.e. class with maximal probability) |
| 35 | +static void getMaxClass(const Mat &probBlob, int &classId, double &classProb); |
| 36 | + |
| 37 | +void predictor(Net net, const Mat &roi, int &class_id, double &probability); |
| 38 | + |
| 39 | +int main(int argc, char **argv) |
| 40 | +{ |
| 41 | + // Parse command line arguments. |
| 42 | + CommandLineParser parser(argc, argv, keys); |
| 43 | + |
| 44 | + if (argc == 1 || parser.has("help")) |
| 45 | + { |
| 46 | + parser.printMessage(); |
| 47 | + return 0; |
| 48 | + } |
| 49 | + |
| 50 | + int vWidth = parser.get<int>("width"); |
| 51 | + int vHeight = parser.get<int>("height"); |
| 52 | + float confThreshold = parser.get<float>("thr"); |
| 53 | + std::string modelTxt = parser.get<String>("modelTxt"); |
| 54 | + std::string modelBin = parser.get<String>("modelBin"); |
| 55 | + |
| 56 | + Net net; |
| 57 | + try |
| 58 | + { |
| 59 | + net = readNet(modelTxt, modelBin); |
| 60 | + } |
| 61 | + catch (cv::Exception &ee) |
| 62 | + { |
| 63 | + std::cerr << "Exception: " << ee.what() << std::endl; |
| 64 | + std::cout << "Can't load the network by using the flowing files:" << std::endl; |
| 65 | + std::cout << "modelTxt: " << modelTxt << std::endl; |
| 66 | + std::cout << "modelBin: " << modelBin << std::endl; |
| 67 | + return 1; |
| 68 | + } |
| 69 | + |
| 70 | + const std::string resultWinName = "Please write the number on white paper and occupy the entire camera."; |
| 71 | + const std::string preWinName = "Preprocessing"; |
| 72 | + |
| 73 | + namedWindow(preWinName, WINDOW_AUTOSIZE); |
| 74 | + namedWindow(resultWinName, WINDOW_AUTOSIZE); |
| 75 | + |
| 76 | + Mat labels, stats, centroids; |
| 77 | + Point position; |
| 78 | + |
| 79 | + Rect getRectangle; |
| 80 | + bool ifDrawingBox = false; |
| 81 | + |
| 82 | + int classId = 0; |
| 83 | + double probability = 0; |
| 84 | + |
| 85 | + Rect basicRect = Rect(0, 0, vWidth, vHeight); |
| 86 | + Mat rawImage; |
| 87 | + |
| 88 | + double fps = 0; |
| 89 | + |
| 90 | + // Open a video file or an image file or a camera stream. |
| 91 | + VideoCapture cap; |
| 92 | + if (parser.has("input")) |
| 93 | + cap.open(parser.get<String>("input")); |
| 94 | + else |
| 95 | + cap.open(parser.get<int>("device")); |
| 96 | + |
| 97 | + TickMeter tm; |
| 98 | + |
| 99 | + while (waitKey(1) < 0) |
| 100 | + { |
| 101 | + cap >> rawImage; |
| 102 | + if (rawImage.empty()) |
| 103 | + { |
| 104 | + waitKey(); |
| 105 | + break; |
| 106 | + } |
| 107 | + |
| 108 | + tm.reset(); |
| 109 | + tm.start(); |
| 110 | + |
| 111 | + Mat image = rawImage.clone(); |
| 112 | + // Image preprocessing |
| 113 | + cvtColor(image, image, COLOR_BGR2GRAY); |
| 114 | + GaussianBlur(image, image, Size(3, 3), 2, 2); |
| 115 | + adaptiveThreshold(image, image, 255, ADAPTIVE_THRESH_MEAN_C, THRESH_BINARY, 25, 10); |
| 116 | + bitwise_not(image, image); |
| 117 | + |
| 118 | + Mat element = getStructuringElement(MORPH_RECT, Size(3, 3), Point(-1,-1)); |
| 119 | + dilate(image, image, element, Point(-1,-1), 1); |
| 120 | + // Find connected component |
| 121 | + int nccomps = cv::connectedComponentsWithStats(image, labels, stats, centroids); |
| 122 | + |
| 123 | + for (int i = 1; i < nccomps; i++) |
| 124 | + { |
| 125 | + ifDrawingBox = false; |
| 126 | + |
| 127 | + // Extend the bounding box of connected component for easier recognition |
| 128 | + if (stats.at<int>(i - 1, CC_STAT_AREA) > 80 && stats.at<int>(i - 1, CC_STAT_AREA) < 3000) |
| 129 | + { |
| 130 | + ifDrawingBox = true; |
| 131 | + int left = stats.at<int>(i - 1, CC_STAT_HEIGHT) / 4; |
| 132 | + getRectangle = Rect(stats.at<int>(i - 1, CC_STAT_LEFT) - left, stats.at<int>(i - 1, CC_STAT_TOP) - left, stats.at<int>(i - 1, CC_STAT_WIDTH) + 2 * left, stats.at<int>(i - 1, CC_STAT_HEIGHT) + 2 * left); |
| 133 | + getRectangle &= basicRect; |
| 134 | + } |
| 135 | + |
| 136 | + if (ifDrawingBox && !getRectangle.empty()) |
| 137 | + { |
| 138 | + Mat roi = image(getRectangle); |
| 139 | + predictor(net, roi, classId, probability); |
| 140 | + |
| 141 | + if (probability < confThreshold) |
| 142 | + continue; |
| 143 | + |
| 144 | + rectangle(rawImage, getRectangle, Scalar(128, 255, 128), 2); |
| 145 | + |
| 146 | + position = Point(getRectangle.br().x - 7, getRectangle.br().y + 25); |
| 147 | + putText(rawImage, std::to_string(classId), position, 3, 1.0, Scalar(128, 128, 255), 2); |
| 148 | + } |
| 149 | + } |
| 150 | + |
| 151 | + tm.stop(); |
| 152 | + fps = 1 / tm.getTimeSec(); |
| 153 | + std::string fpsString = format("Inference FPS: %.2f.", fps); |
| 154 | + putText(rawImage, fpsString, Point(5, 20), FONT_HERSHEY_SIMPLEX, 0.6, Scalar(128, 255, 128)); |
| 155 | + |
| 156 | + imshow(resultWinName, rawImage); |
| 157 | + imshow(preWinName, image); |
| 158 | + |
| 159 | + } |
| 160 | + |
| 161 | + return 0; |
| 162 | +} |
| 163 | + |
| 164 | +static void getMaxClass(const Mat &probBlob, int &classId, double &classProb) |
| 165 | +{ |
| 166 | + Mat probMat = probBlob.reshape(1, 1); |
| 167 | + Point classNumber; |
| 168 | + minMaxLoc(probMat, NULL, &classProb, NULL, &classNumber); |
| 169 | + classId = classNumber.x; |
| 170 | +} |
| 171 | + |
| 172 | +void predictor(Net net, const Mat &roi, int &classId, double &probability) |
| 173 | +{ |
| 174 | + Mat pred; |
| 175 | + // Convert Mat to batch of images |
| 176 | + Mat inputBlob = dnn::blobFromImage(roi, 1.0, Size(28, 28)); |
| 177 | + // Set the network input |
| 178 | + net.setInput(inputBlob); |
| 179 | + // Compute output |
| 180 | + pred = net.forward(); |
| 181 | + getMaxClass(pred, classId, probability); |
| 182 | +} |
0 commit comments