Skip to content

Commit 44d7339

Browse files
committed
Wordspotter implemented
Classifier implemented as an independent class from the wordspotter. -Works both on CPU and GPU at expected top performance of 26ms per sample on a GTX 980Ti -Simple demo for wordspotter added for cpp -Interactive demo added for python
1 parent 51a4f6e commit 44d7339

File tree

9 files changed

+747
-1
lines changed

9 files changed

+747
-1
lines changed

modules/text/CMakeLists.txt

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,61 @@ endif()
2323
if(${Tesseract_FOUND})
2424
target_link_libraries(opencv_text ${Tesseract_LIBS})
2525
endif()
26+
27+
# anguelos: Principal source from which adaptation came is the cnn_3dobj module.
#
# Optional Caffe support for opencv_text. Caffe, Protobuf and Glog are probed
# independently; HAVE_CAFFE / HAVE_PROTOBUF / HAVE_GLOG record the results so
# that text_config.hpp.in can be configured from them.
find_package(Caffe)

if(Caffe_FOUND)
  message(STATUS "Caffe: YES")
  set(HAVE_CAFFE 1)
else()
  message(STATUS "Caffe: NO")
endif()

find_package(Protobuf)
if(Protobuf_FOUND)
  message(STATUS "Protobuf: YES")
  set(HAVE_PROTOBUF 1)
else()
  message(STATUS "Protobuf: NO")
endif()

find_package(Glog)
if(Glog_FOUND)
  message(STATUS "Glog: YES")
  set(HAVE_GLOG 1)
else()
  message(STATUS "Glog: NO")
endif()

if(HAVE_CAFFE)
  message(STATUS "HAVE CAFFE!!!")
  # NOTE(review): this generates a header inside the source tree; confirm that
  # is intended, since the same template is also configured into
  # CMAKE_BINARY_DIR at the end of this file.
  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
                 ${CMAKE_CURRENT_SOURCE_DIR}/include/opencv2/text_config.hpp @ONLY)

  include_directories(${CMAKE_CURRENT_BINARY_DIR})
  include_directories(${Caffe_INCLUDE_DIR})

  # Anguelos: taken from caffe's cmake
  find_package(HDF5 COMPONENTS HL REQUIRED)
  include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
  list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})

  find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
  include_directories(SYSTEM ${Boost_INCLUDE_DIR})
  list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES})

  # Locate the CUDA headers through CMake instead of a machine-specific
  # /usr/local/cuda-7.5 path. CPU-only setups simply skip this.
  find_package(CUDA QUIET)
  if(CUDA_FOUND)
    include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
  endif()

  # Anguelos: taken from caffe's cmake
  # Link order is kept as Caffe, Glog, Protobuf, HDF5, Boost. Glog and Protobuf
  # are only added when found, so a *-NOTFOUND value never reaches
  # target_link_libraries (which would abort the configuration).
  target_link_libraries(opencv_text ${Caffe_LIBS})
  if(Glog_FOUND)
    target_link_libraries(opencv_text ${Glog_LIBS})
  endif()
  if(Protobuf_FOUND)
    target_link_libraries(opencv_text ${Protobuf_LIBS})
  endif()
  target_link_libraries(opencv_text ${HDF5_LIBRARIES} ${Boost_LIBRARIES})
endif()

# Always generate the configuration header in the top-level build directory.
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
               ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)

modules/text/FindCaffe.cmake

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Find module for Caffe.
# Defines Caffe_INCLUDE_DIR (header directory) and Caffe_LIBS (the caffe library).
# Caffe_FOUND is set to 1 only when both of them were located.
2+
unset(Caffe_FOUND)
3+
4+
# Header search: /usr/local/include is given as a hint.
find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp
5+
HINTS
6+
/usr/local/include)
7+
8+
# Library search: /usr/local/lib is given as a hint.
find_library(Caffe_LIBS NAMES caffe
9+
HINTS
10+
/usr/local/lib)
11+
12+
# Report success only when both the library and the headers are available.
if(Caffe_LIBS AND Caffe_INCLUDE_DIR)
13+
set(Caffe_FOUND 1)
14+
endif()

modules/text/FindGlog.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Find module for Glog (required for Caffe).
# Defines Glog_LIBS; Glog_FOUND is set to 1 when the library was located.
2+
unset(Glog_FOUND)
3+
4+
# Library search: /usr/local/lib is given as a hint.
find_library(Glog_LIBS NAMES glog
5+
HINTS
6+
/usr/local/lib)
7+
8+
if(Glog_LIBS)
9+
set(Glog_FOUND 1)
10+
endif()

modules/text/FindProtobuf.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Find module for Protobuf (required for Caffe).
# Defines Protobuf_LIBS; Protobuf_FOUND is set to 1 when the library was located.
# NOTE(review): CMake ships its own FindProtobuf module - confirm which one
# find_package(Protobuf) is expected to pick up.
2+
unset(Protobuf_FOUND)
3+
4+
# Library search: /usr/local/lib is given as a hint.
find_library(Protobuf_LIBS NAMES protobuf
5+
HINTS
6+
/usr/local/lib)
7+
8+
if(Protobuf_LIBS)
9+
set(Protobuf_FOUND 1)
10+
endif()

modules/text/include/opencv2/text/ocr.hpp

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@
4646

4747
#include <vector>
4848
#include <string>
49+
#include <iostream>
50+
#include <sstream>
51+
52+
4953

5054
namespace cv
5155
{
@@ -465,6 +469,191 @@ CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClas
465469

466470
//! @}
467471

472+
473+
474+
// Classifiers should provide different backends.
475+
// For the moment only Caffe is implemented.
476+
enum{
477+
OCR_HOLISTIC_BACKEND_NONE,
478+
OCR_HOLISTIC_BACKEND_CAFFE
479+
};
480+
481+
482+
/** @brief Abstract class that implements the classification of text images.
483+
*
484+
* The interface is generic enough to describe any image classifier, and allows
485+
* taking advantage of computing in batches. While word classifiers are the default
486+
* networks, any image classifier should work.
487+
*
488+
*/
489+
class CV_EXPORTS_W TextImageClassifier
490+
{
491+
protected:
492+
Size inputSz_;
493+
int channelCount_;
494+
/** @brief all image preprocessing is handled here, including whitening etc.
495+
*
496+
* @param input the image to be preprocessed for the classifier. If the depth
497+
* is CV_8U, values should be in [0,255]; otherwise values are assumed to be in [0,1]
498+
*
499+
* @param output reference to the image to be fed to the classifier; the preprocessor will
500+
* resize the image to the appropriate size and convert it to the appropriate depth.
501+
*
502+
* The method preprocess should never be used externally; it is up to the classify and classifyBatch
503+
* methods to employ it.
504+
*/
505+
virtual void preprocess(Mat& input,Mat& output)=0;
506+
public:
507+
virtual ~TextImageClassifier() {}
508+
/** @brief produces a class confidence row-vector given an image
509+
*/
510+
CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0;
511+
/** @brief produces a matrix containing class confidence row-vectors given a collection of images
512+
*/
513+
CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0;
514+
/** @brief simple getter method returning the size of the output row-vector
515+
*/
516+
CV_WRAP virtual int getOutputSize()=0;
517+
/** @brief simple getter method returning the size of the minibatches for this classifier.
518+
* If not applicable this method should return 1
519+
*/
520+
CV_WRAP virtual int getMinibatchSize()=0;
521+
/** @brief simple getter method returning a value describing the framework being employed to implement the classifier
522+
*/
523+
CV_WRAP virtual int getBackend(){return OCR_HOLISTIC_BACKEND_NONE;}
524+
};
525+
526+
/** @brief Class that uses a pretrained caffe model for word classification.
 *
 * This network is described in detail in:
 * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
 * http://arxiv.org/abs/1412.1842
 */
class CV_EXPORTS_W DictNet:public TextImageClassifier
{
public:
    virtual ~DictNet() {}

    /** @brief getter telling whether the classifier computes on the GPU
     * (presumably mirrors the useGpu flag given to create() - confirm in the implementation).
     */
    CV_WRAP virtual bool usingGpu()=0;

    /** @brief Constructs a DictNet object from a caffe pretrained model
     *
     * @param archFilename is the path to the prototxt file containing the deployment model architecture description.
     *
     * @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be
     * very large, up to 2GB.
     *
     * @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter
     * has an effect only when computing on the GPU and should be set with respect to the memory available on the GPU.
     *
     * @param useGpu boolean flag setting GPU or CPU computation
     *
     * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
     * the only option
     */
    CV_WRAP static Ptr<DictNet> create(String archFilename,String weightsFilename,int minibatchSz=100,bool useGpu=false,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE);
};
555+
556+
557+
558+
/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented wordspotting.
559+
* Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable
560+
* word given an input image.
561+
*
562+
* This class implements the logic of providing transcriptions given a vocabulary and an image
563+
* classifier.
564+
*/
565+
class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR
566+
{
567+
public:
568+
/** @brief Recognize text using a segmentation based word-spotting/classifier cnn.
569+
570+
Takes image on input and returns recognized text in the output_text parameter. Optionally
571+
provides also the Rects for individual text elements found (e.g. words), and the list of those
572+
text elements with their confidence values.
573+
574+
@param image Input image CV_8UC1 or CV_8UC3
575+
576+
@param output_text Output text of the word spotting, always one that exists in the dictionary.
577+
578+
@param component_rects Not applicable for word spotting; can be NULL. If not NULL, a single element will
579+
be put in the vector.
580+
581+
@param component_texts Not applicable for word spotting; can be NULL. If not NULL, a single element will
582+
be put in the vector.
583+
584+
@param component_confidences Not applicable for word spotting; can be NULL. If not NULL, a single element will
585+
be put in the vector.
586+
587+
@param component_level must be OCR_LEVEL_WORD.
588+
589+
@param mask is totally ignored and is only available for compatibility reasons
590+
*/
591+
592+
virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
593+
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
594+
int component_level=OCR_LEVEL_WORD)=0;
595+
596+
virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
597+
std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
598+
int component_level=OCR_LEVEL_WORD)=0;
599+
600+
601+
/**
602+
@brief Method that provides a quick and simple interface to a single word image classification
603+
604+
@param inputImage an image expected to be a CV_8UC1 or CV_8UC3 of any size, assumed to contain a single word
605+
606+
@param transcription an opencv string that will store the detected word transcription
607+
608+
@param confidence a double that will be updated with the confidence the classifier has for the selected word
609+
*/
610+
CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0;
611+
612+
/**
613+
@brief Method that provides a quick and simple interface to a multiple word image classification, taking advantage
614+
of the classifier's parallel capabilities.
615+
616+
@param inputImageList a list of images expected to be CV_8UC1 or CV_8UC3; each image can be of any size and is assumed
617+
to contain a single word.
618+
619+
@param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each
620+
input image
621+
622+
@param confidences a vector of double that will be updated with the confidence the classifier has for each of the
623+
selected words.
624+
*/
625+
CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptions,CV_OUT std::vector<double>& confidences)=0;
626+
627+
628+
/**
629+
@brief simple getter for the vocabulary employed
630+
*/
631+
CV_WRAP virtual const std::vector<String>& getVocabulary()=0;
632+
633+
634+
/** @brief Creates an instance of the OCRHolisticWordRecognizer class.
635+
636+
@param classifierPtr an instance of TextImageClassifier, normally a DictNet instance
637+
@param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
638+
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
639+
of the classifier.
640+
*/
641+
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,String vocabullaryFilename);
642+
/** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DictNet classifier.
643+
644+
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture.
645+
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
646+
@param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line
647+
in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize
648+
of the classifier.
649+
*/
650+
CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename);
651+
652+
};
653+
654+
468655
}
469656
}
657+
658+
470659
#endif // _OPENCV_TEXT_OCR_HPP_

modules/text/samples/dictnet_demo.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
/*
2+
* dictnet_demo.cpp
3+
*
4+
* Demonstrates simple use of the holistic word classifier in C++
5+
*
6+
* Created on: June 26, 2016
7+
* Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
8+
*/
9+
10+
#include "opencv2/text.hpp"
11+
#include "opencv2/highgui.hpp"
12+
#include "opencv2/imgproc.hpp"
13+
14+
#include <sstream>
15+
#include <vector>
16+
#include <iostream>
17+
#include <iomanip>
18+
#include <fstream>
19+
20+
/**
 * Builds the help text printed whenever the demo cannot proceed.
 *
 * @param progFname name of the executable, inserted into the usage line.
 * @return the complete multi-line help message (description, usage and the
 *         shell commands for downloading the model files).
 */
std::string getHelpStr(const std::string& progFname){
    std::stringstream out;
    // '\n' rather than std::endl: the text is only assembled in memory, so
    // flushing after every line serves no purpose.
    out << " Demo of wordspotting CNN for text recognition." << '\n';
    out << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << '\n' << '\n';

    out << " Usage: " << progFname << " <output_file> <input_image1> <input_image2> ... <input_imageN>" << '\n';
    out << " Caffe Model files (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)" << '\n';
    out << " must be in the current directory." << '\n' << '\n';

    out << " Obtaining Caffe Model files in linux shell:" << '\n';
    out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << '\n';
    out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << '\n';
    out << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" << '\n' << '\n';
    return out.str();
}
35+
36+
/**
 * Checks whether a file can be opened for reading.
 *
 * @param filename path of the file to probe.
 * @return true when an input stream opened on the path is in a good state.
 */
inline bool fileExists (const std::string& filename) {
    std::ifstream f(filename.c_str());
    return f.good();
}
40+
41+
42+
int main(int argc, const char * argv[]){
43+
const int USE_GPU=0;
44+
45+
if (argc < 3){
46+
std::cout<<getHelpStr(argv[0]);
47+
exit(1);
48+
std::cout<<"Insufiecient parameters. Aborting!"<<std::endl;
49+
}
50+
51+
if (!fileExists("dictnet_vgg.caffemodel") ||
52+
!fileExists("dictnet_vgg_deploy.prototxt") ||
53+
!fileExists("dictnet_vgg_labels.txt")){
54+
std::cout<<getHelpStr(argv[0]);
55+
std::cout<<"Model files not found in the current directory. Aborting!"<<std::endl;
56+
exit(1);
57+
}
58+
59+
if (fileExists(argv[1])){
60+
std::cout<<getHelpStr(argv[0]);
61+
std::cout<<"Output file must not exist. Aborting!"<<std::endl;
62+
exit(1);
63+
}
64+
65+
std::vector<cv::Mat> imageList;
66+
for(int imageIdx=2;imageIdx<argc;imageIdx++){
67+
if (fileExists(argv[imageIdx])){
68+
imageList.push_back(cv::imread(cv::String(argv[imageIdx])));
69+
}else{
70+
std::cout<<getHelpStr(argv[0]);
71+
std::cout<<argv[imageIdx]<<" doesn't exist. Aborting";
72+
}
73+
}
74+
cv::Ptr<cv::text::DictNet> cnn=cv::text::DictNet::create(
75+
"dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",100,USE_GPU);
76+
77+
cv::Ptr<cv::text::OCRHolisticWordRecognizer> wordSpotter=
78+
cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt");
79+
80+
std::vector<cv::String> wordList;
81+
std::vector<double> outProbabillities;
82+
wordSpotter->recogniseImageBatch(imageList,wordList,outProbabillities);
83+
84+
std::ofstream out;
85+
out.open(argv[1]);
86+
for(int imgIdx=0;imgIdx<imageList.size();imgIdx++){
87+
out<<argv[imgIdx+2]<<","<<wordList[imgIdx]<<","<<outProbabillities[imgIdx]<<std::endl;
88+
}
89+
out.close();
90+
}

0 commit comments

Comments
 (0)