Wordspotter implemented

anguelos · anguelos · commit 44d7339febdb · 2016-07-06T16:36:23.000+02:00
Classifier implemented as an independent class from the wordspotter.
-Works both on CPU and GPU at expected top performance of 26ms per sample on a GTX 980Ti
-Simple demo for wordspotter added for cpp
-Interactive demo added for python
diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt
@@ -23,3 +23,61 @@ endif()
 if(${Tesseract_FOUND})
   target_link_libraries(opencv_text ${Tesseract_LIBS})
 endif()
+
+#anguelos: Principal source from which adaptation came is the cnn_3dobj module
+find_package(Caffe)  # optional (no REQUIRED keyword): the result is tested right below
+
+if(Caffe_FOUND)
+  message(STATUS "Caffe:   YES")
+  set(HAVE_CAFFE 1)
+else()
+  message(STATUS "Caffe:   NO")
+endif()
+
+find_package(Protobuf)  # optional; HAVE_PROTOBUF is set only when it is found
+if(Protobuf_FOUND)
+  message(STATUS "Protobuf:   YES")
+  set(HAVE_PROTOBUF 1)
+else()
+  message(STATUS "Protobuf:   NO")
+endif()
+
+find_package(Glog)  # optional; HAVE_GLOG is set only when it is found
+if(Glog_FOUND)
+  message(STATUS "Glog:   YES")
+  set(HAVE_GLOG 1)
+else()
+  message(STATUS "Glog:   NO")
+endif()
+
+if(HAVE_CAFFE)  # everything down to the matching endif() only runs when Caffe was found
+message(STATUS "HAVE CAFFE!!!")
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
+               ${CMAKE_CURRENT_SOURCE_DIR}/include/opencv2/text_config.hpp @ONLY)
+# NOTE(review): the call above writes a generated header into the SOURCE tree - confirm this is intended.
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+if(${Caffe_FOUND})  # same condition as the enclosing if(HAVE_CAFFE)
+
+  include_directories(${Caffe_INCLUDE_DIR})
+  #Anguelos: taken from caffe's cmake
+  find_package(HDF5 COMPONENTS HL REQUIRED)  # REQUIRED: configuration stops here if HDF5 is missing
+  include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
+  list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})  # NOTE(review): Caffe_LINKER_LIBS is not referenced again in this hunk
+  find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
+  include_directories(SYSTEM ${Boost_INCLUDE_DIR})
+  include_directories(SYSTEM /usr/local/cuda-7.5/targets/x86_64-linux/include/)  # NOTE(review): hard-coded, machine-specific CUDA 7.5 path
+  list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES})
+
+endif()
+
+
+if(${Caffe_FOUND})  # second block guarded by the same condition: link the module against Caffe and its dependencies
+  #Anguelos: taken from caffe's cmake
+  target_link_libraries(opencv_text ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES})
+endif()
+endif()
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
+               ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)  # runs with or without Caffe; output goes to the top-level build dir
diff --git a/modules/text/FindCaffe.cmake b/modules/text/FindCaffe.cmake
@@ -0,0 +1,14 @@
+# Find module for Caffe: sets Caffe_INCLUDE_DIR, Caffe_LIBS and, when both exist, Caffe_FOUND.
+unset(Caffe_FOUND)
+
+find_path(Caffe_INCLUDE_DIR
+  NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp
+  HINTS /usr/local/include)
+
+find_library(Caffe_LIBS
+  NAMES caffe
+  HINTS /usr/local/lib)
+
+if(Caffe_INCLUDE_DIR AND Caffe_LIBS)
+    set(Caffe_FOUND 1)
+endif()
diff --git a/modules/text/FindGlog.cmake b/modules/text/FindGlog.cmake
@@ -0,0 +1,10 @@
+# Find module for glog (required for Caffe): sets Glog_LIBS and, when found, Glog_FOUND.
+unset(Glog_FOUND)
+
+find_library(Glog_LIBS
+  NAMES glog
+  HINTS /usr/local/lib)
+
+if(Glog_LIBS)
+    set(Glog_FOUND 1)
+endif()
diff --git a/modules/text/FindProtobuf.cmake b/modules/text/FindProtobuf.cmake
@@ -0,0 +1,10 @@
+# Find module for protobuf (required for Caffe): sets Protobuf_LIBS and, when found, Protobuf_FOUND.
+unset(Protobuf_FOUND)
+
+find_library(Protobuf_LIBS
+  NAMES protobuf
+  HINTS /usr/local/lib)
+
+if(Protobuf_LIBS)
+    set(Protobuf_FOUND 1)
+endif()
diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp
@@ -46,6 +46,10 @@
 
 #include <vector>
 #include <string>
+#include <iostream>
+#include <sstream>
+
+
 
 namespace cv
 {
@@ -465,6 +469,191 @@ CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClas
 
 //! @}
 
+
+
+//Classifiers should provide different backends
+//For the moment only caffe is implemented
+enum{
+    OCR_HOLISTIC_BACKEND_NONE, // value returned by the default TextImageClassifier::getBackend()
+    OCR_HOLISTIC_BACKEND_CAFFE // default backEnd argument of DictNet::create()
+};
+
+
+/** @brief Abstract class that implements the classification of text images.
+ *
+ * The interface is generic enough to describe any image classifier, and allows
+ * taking advantage of computing in batches. While word classifiers are the default
+ * networks, any image classifier should work.
+ *
+ */
+class CV_EXPORTS_W TextImageClassifier
+{
+protected:
+    Size inputSz_; // NOTE(review): presumably the input size expected by the classifier - confirm in the implementation
+    int channelCount_; // NOTE(review): presumably the channel count expected by the classifier - confirm in the implementation
+    /** @brief all image preprocessing is handled here including whitening etc.
+     *
+     * @param input the image to be preprocessed for the classifier. If the depth
+     * is CV_8U values should be in [0,255] otherwise values are assumed to be in [0,1]
+     *
+     * @param output reference to the image to be fed to the classifier, the preprocessor will
+     * resize the image to the appropriate size and convert it to the appropriate depth
+     *
+     * The method preprocess should never be used externally, it is up to classify and classifyBatch
+     * methods to employ it.
+     */
+    virtual void preprocess(Mat& input,Mat& output)=0;
+public:
+    virtual ~TextImageClassifier() {}
+    /** @brief produces a class confidence row-vector given an image
+     */
+    CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0;
+    /** @brief produces a matrix containing class confidence row-vectors given a collection of images
+     */
+    CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0;
+    /** @brief simple getter method returning the size of the output row-vector
+     */
+    CV_WRAP virtual int getOutputSize()=0;
+    /** @brief simple getter method returning the size of the minibatches for this classifier.
+     * If not applicable this method should return 1
+     */
+    CV_WRAP virtual int getMinibatchSize()=0;
+    /** @brief simple getter method returning a value describing the framework being employed to implement the classifier
+     */
+    CV_WRAP virtual int getBackend(){return OCR_HOLISTIC_BACKEND_NONE;}
+};
+
+/** @brief Class that uses a pretrained caffe model for word classification.
+ *
+ * This network is described in detail in:
+ * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
+ * http://arxiv.org/abs/1412.1842
+ */
+class CV_EXPORTS_W DictNet:public TextImageClassifier
+{
+public:
+    virtual ~DictNet() {}
+    /** @brief expected to report whether GPU computation is in use; provided by the concrete backend */
+    CV_WRAP virtual bool usingGpu()=0;
+    /** @brief Constructs a DictNet object from a caffe pretrained model
+     *
+     * @param archFilename is the path to the prototxt file containing the deployment model architecture description.
+     *
+     * @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be
+     * very large, up to 2GB.
+     *
+     * @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter
+     * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU.
+     *
+     * @param useGpu boolean flag setting GPU or CPU computation
+     *
+     * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
+     * the only option
+     */
+    CV_WRAP static Ptr<DictNet> create(String archFilename,String weightsFilename,int minibatchSz=100,bool useGpu=false,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE);
+};
+
+
+
+/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented wordspotting.
+ * Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable
+ * word given an input image.
+ *
+ * This class implements the logic of providing transcriptions given a vocabulary and an image
+ * classifier.
+ */
+class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR
+{
+public:
+    /** @brief Recognize text using a segmentation based word-spotting/classifier cnn.
+
+    Takes image on input and returns recognized text in the output_text parameter. Optionally
+    provides also the Rects for individual text elements found (e.g. words), and the list of those
+    text elements with their confidence values.
+
+    @param image Input image CV_8UC1 or CV_8UC3
+
+    @param output_text Output text of the word spotting, always one that exists in the dictionary.
+
+    @param component_rects Not applicable for word spotting; can be NULL. If not NULL, a single element will
+        be put in the vector.
+
+    @param component_texts Not applicable for word spotting; can be NULL. If not NULL, a single element will
+        be put in the vector.
+
+    @param component_confidences Not applicable for word spotting; can be NULL. If not NULL, a single element will
+        be put in the vector.
+
+    @param component_level must be OCR_LEVEL_WORD.
+
+    @param mask is totally ignored and is only available for compatibility reasons
+     */
+
+    virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     int component_level=OCR_LEVEL_WORD)=0;
+
+    virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+                     std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+                     int component_level=OCR_LEVEL_WORD)=0;
+
+
+    /**
+    @brief Method that provides a quick and simple interface to a single word image classification
+
+    @param inputImage an image expected to be a CV_8UC1 or CV_8UC3 of any size assumed to contain a single word
+
+    @param transcription an opencv string that will store the detected word transcription
+
+    @param confidence a double that will be updated with the confidence the classifier has for the selected word
+    */
+    CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0;
+
+    /**
+    @brief Method that provides a quick and simple interface to a multiple word image classification taking advantage
+    of the classifier's parallel capabilities.
+
+    @param inputImageList a list of images expected to be a CV_8UC1 or CV_8UC3; each image can be of any size and is assumed
+    to contain a single word.
+
+    @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each
+    input image
+
+    @param confidences a vector of double that will be updated with the confidence the classifier has for each of the
+    selected words.
+    */
+    CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptions,CV_OUT std::vector<double>& confidences)=0;
+
+
+    /**
+    @brief simple getter for the vocabulary employed
+    */
+    CV_WRAP virtual const std::vector<String>& getVocabulary()=0;
+
+
+    /** @brief Creates an instance of the OCRHolisticWordRecognizer class.
+
+    @param classifierPtr an instance of TextImageClassifier, normally a DictNet instance
+    @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
+    in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
+    of the classifier.
+     */
+    CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,String vocabullaryFilename);
+    /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DictNet classifier.
+
+    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
+    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
+    @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
+    in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
+    of the classifier.
+    */
+    CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename);
+
+};
+
+
 }
 }
+
+
 #endif // _OPENCV_TEXT_OCR_HPP_
diff --git a/modules/text/samples/dictnet_demo.cpp b/modules/text/samples/dictnet_demo.cpp
@@ -0,0 +1,90 @@
+/*
+ * dictnet_demo.cpp
+ *
+ * Demonstrates simple use of the holistic word classifier in C++
+ *
+ * Created on: June 26, 2016
+ *     Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
+ */
+
+#include  "opencv2/text.hpp"
+#include  "opencv2/highgui.hpp"
+#include  "opencv2/imgproc.hpp"
+
+#include  <sstream>
+#include  <vector>
+#include  <iostream>
+#include  <iomanip>
+#include  <fstream>
+
+// Returns the usage/help text of the demo; progFname is printed as the program
+// name in the "Usage:" line. The text ends with an empty line.
+std::string getHelpStr(std::string progFname){
+    std::ostringstream help;
+    help << "    Demo of wordspotting CNN for text recognition." << '\n'
+         << "    Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << '\n' << '\n'
+         << "    Usage: " << progFname << " <output_file> <input_image1> <input_image2> ... <input_imageN>" << '\n'
+         << "    Caffe Model files  (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)" << '\n'
+         << "      must be in the current directory." << '\n' << '\n'
+         << "    Obtaining Caffe Model files in linux shell:" << '\n'
+         << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << '\n'
+         << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << '\n'
+         << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" << '\n' << '\n';
+    return help.str();
+}
+
+inline bool fileExists (std::string filename) {
+    // The file counts as existing when an input stream on it opens in the good() state.
+    return std::ifstream(filename.c_str()).good();
+}
+
+
+// Demo entry point: classifies every image named on the command line and writes one
+// "<image>,<word>,<confidence>" line per image to the output file. Returns 1 on bad input.
+int main(int argc, const char * argv[]){
+    const bool USE_GPU=false;
+
+    if (argc < 3){
+        std::cout<<getHelpStr(argv[0]);
+        std::cout<<"Insufiecient parameters. Aborting!"<<std::endl; // print before leaving, not after
+        return 1;
+    }
+
+    if (!fileExists("dictnet_vgg.caffemodel") ||
+            !fileExists("dictnet_vgg_deploy.prototxt") ||
+            !fileExists("dictnet_vgg_labels.txt")){
+        std::cout<<getHelpStr(argv[0]);
+        std::cout<<"Model files not found in the current directory. Aborting!"<<std::endl;
+        return 1;
+    }
+
+    if (fileExists(argv[1])){ // never overwrite an existing file
+        std::cout<<getHelpStr(argv[0]);
+        std::cout<<"Output file must not exist. Aborting!"<<std::endl;
+        return 1;
+    }
+
+    std::vector<cv::Mat> imageList;
+    for(int imageIdx=2;imageIdx<argc;imageIdx++){
+        const cv::Mat image=cv::imread(cv::String(argv[imageIdx]));
+        if (image.empty()){ // imread yields an empty Mat on failure; abort so results stay aligned with argv
+            std::cout<<getHelpStr(argv[0]);
+            std::cout<<argv[imageIdx]<<" doesn't exist or is not a readable image. Aborting!"<<std::endl;
+            return 1;
+        }
+        imageList.push_back(image);
+    }
+    cv::Ptr<cv::text::DictNet> cnn=cv::text::DictNet::create(
+                "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",100,USE_GPU);
+    cv::Ptr<cv::text::OCRHolisticWordRecognizer> wordSpotter=
+            cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt");
+
+    std::vector<cv::String> wordList;
+    std::vector<double> outProbabillities;
+    wordSpotter->recogniseImageBatch(imageList,wordList,outProbabillities);
+
+    std::ofstream out(argv[1]); // closed by its destructor when main returns
+    for(std::size_t imgIdx=0;imgIdx<imageList.size();imgIdx++){
+        out<<argv[imgIdx+2]<<","<<wordList[imgIdx]<<","<<outProbabillities[imgIdx]<<std::endl;
+    }
+}
diff --git a/modules/text/samples/dictnet_demo.py b/modules/text/samples/dictnet_demo.py
diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp
diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in