|
46 | 46 |
|
47 | 47 | #include <vector>
|
48 | 48 | #include <string>
|
| 49 | +#include <iostream> |
| 50 | +#include <sstream> |
| 51 | + |
| 52 | + |
49 | 53 |
|
50 | 54 | namespace cv
|
51 | 55 | {
|
@@ -465,6 +469,191 @@ CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClas
|
465 | 469 |
|
466 | 470 | //! @}
|
467 | 471 |
|
| 472 | + |
| 473 | + |
| 474 | +//Classifiers should provide different backends |
| 475 | +//For the moment only caffe is implemented |
| 476 | +enum{ |
| 477 | + OCR_HOLISTIC_BACKEND_NONE, |
| 478 | + OCR_HOLISTIC_BACKEND_CAFFE |
| 479 | +}; |
| 480 | + |
| 481 | + |
| 482 | +/** @brief Abstract class that implements the classification of text images. |
| 483 | + * |
| 484 | + * The interface is generic enough to describe any image classifier, and allows |
| 485 | + * taking advantage of computing in batches. While word classifiers are the default |
| 486 | + * networks, any image classifier should work. |
| 487 | + * |
| 488 | + */ |
| 489 | +class CV_EXPORTS_W TextImageClassifier |
| 490 | +{ |
| 491 | +protected: |
| 492 | + Size inputSz_; |
| 493 | + int channelCount_; |
| 494 | + /** @brief all image preprocessing is handled here including whitening etc. |
| 495 | + * |
| 496 | + * @param input the image to be preprocessed for the classifier. If the depth |
| 497 | + * is CV_8U values should be in [0,255] otherwise values are assumed to be in [0,1] |
| 498 | + * |
| 499 | + * @param output reference to the image to be fed to the classifier, the preprocessor will |
| 500 | + * resize the image to the appropriate size and convert it to the appropriate depth |
| 501 | + * |
| 502 | + * The method preprocess should never be used externally, it is up to classify and classifyBatch |
| 503 | + * methods to employ it. |
| 504 | + */ |
| 505 | + virtual void preprocess(Mat& input,Mat& output)=0; |
| 506 | +public: |
| 507 | + virtual ~TextImageClassifier() {} |
| 508 | + /** @brief produces a class confidence row-vector given an image |
| 509 | + */ |
| 510 | + CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; |
| 511 | + /** @brief produces a matrix containing class confidence row-vectors given a collection of images |
| 512 | + */ |
| 513 | + CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0; |
| 514 | + /** @brief simple getter method returning the size of the output row-vector |
| 515 | + */ |
| 516 | + CV_WRAP virtual int getOutputSize()=0; |
| 517 | + /** @brief simple getter method returning the size of the minibatches for this classifier. |
| 518 | + * If not applicable this method should return 1 |
| 519 | + */ |
| 520 | + CV_WRAP virtual int getMinibatchSize()=0; |
| 521 | + /** @brief simple getter method returning a value describing the framework being employed to implement the classifier |
| 522 | + */ |
| 523 | + CV_WRAP virtual int getBackend(){return OCR_HOLISTIC_BACKEND_NONE;} |
| 524 | +}; |
| 525 | + |
| 526 | +class CV_EXPORTS_W DictNet:public TextImageClassifier |
| 527 | +{ |
| 528 | + /** @brief Class that uses a pretrained caffe model for word classification. |
| 529 | + * |
| 530 | + * This network is described in detail in: |
| 531 | + * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 |
| 532 | + * http://arxiv.org/abs/1412.1842 |
| 533 | + */ |
| 534 | +public: |
| 535 | + virtual ~DictNet() {}; |
| 536 | + |
| 537 | + CV_WRAP virtual bool usingGpu()=0; |
| 538 | + /** @brief Constructs a DictNet object from a caffe pretrained model |
| 539 | + * |
| 540 | + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. |
| 541 | + * |
| 542 | + * @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be |
| 543 | + * very large, up to 2GB. |
| 544 | + * |
| 545 | + * @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter |
| 546 | + * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. |
| 547 | + * |
| 548 | + * @param useGpu boolean flag setting GPU or CPU computation |
| 549 | + * |
| 550 | + * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is |
| 551 | + * the only option |
| 552 | + */ |
| 553 | + CV_WRAP static Ptr<DictNet> create(String archFilename,String weightsFilename,int minibatchSz=100,bool useGpu=0,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); |
| 554 | +}; |
| 555 | + |
| 556 | + |
| 557 | + |
| 558 | +/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented wordspotting. |
| 559 | + * Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable |
| 560 | + * word given an input image. |
| 561 | + * |
| 562 | + * This class implements the logic of providing transcriptions given a vocabulary and an image |
| 563 | + * classifier. |
| 564 | + */ |
| 565 | +class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR |
| 566 | +{ |
| 567 | +public: |
| 568 | + /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. |
| 569 | +
|
| 570 | + Takes image on input and returns recognized text in the output_text parameter. Optionally |
| 571 | + provides also the Rects for individual text elements found (e.g. words), and the list of those |
| 572 | + text elements with their confidence values. |
| 573 | +
|
| 574 | + @param image Input image CV_8UC1 or CV_8UC3 |
| 575 | +
|
| 576 | + @param output_text Output text of the word spotting, always one that exists in the dictionary. |
| 577 | +
|
| 578 | + @param component_rects Not applicable for word spotting; can be NULL. If not NULL, a single element will |
| 579 | + be put in the vector. |
| 580 | +
|
| 581 | + @param component_texts Not applicable for word spotting; can be NULL. If not NULL, a single element will |
| 582 | + be put in the vector. |
| 583 | +
|
| 584 | + @param component_confidences Not applicable for word spotting; can be NULL. If not NULL, a single element will |
| 585 | + be put in the vector. |
| 586 | +
|
| 587 | + @param component_level must be OCR_LEVEL_WORD. |
| 588 | +
|
| 589 | + @param mask is totally ignored and is only available for compatibility reasons |
| 590 | + */ |
| 591 | + |
| 592 | + virtual void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, |
| 593 | + std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, |
| 594 | + int component_level=OCR_LEVEL_WORD)=0; |
| 595 | + |
| 596 | + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL, |
| 597 | + std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, |
| 598 | + int component_level=OCR_LEVEL_WORD)=0; |
| 599 | + |
| 600 | + |
| 601 | + /** |
| 602 | + @brief Method that provides a quick and simple interface to a single word image classification |
| 603 | +
|
| 604 | + @param inputImage an image expected to be a CV_8UC1 or CV_8UC3 of any size assumed to contain a single word |
| 605 | +
|
| 606 | + @param transcription an opencv string that will store the detected word transcription |
| 607 | +
|
| 608 | + @param confidence a double that will be updated with the confidence the classifier has for the selected word |
| 609 | + */ |
| 610 | + CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0; |
| 611 | + |
| 612 | + /** |
| 613 | + @brief Method that provides a quick and simple interface to a multiple word image classification taking advantage |
| 614 | + of the classifier's parallel capabilities. |
| 615 | +
|
| 616 | + @param inputImageList a list of images expected to be a CV_8UC1 or CV_8UC3; each image can be of any size and is assumed |
| 617 | + to contain a single word. |
| 618 | +
|
| 619 | + @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each |
| 620 | + input image |
| 621 | +
|
| 622 | + @param confidences a vector of double that will be updated with the confidence the classifier has for each of the |
| 623 | + selected words. |
| 624 | + */ |
| 625 | + CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptions,CV_OUT std::vector<double>& confidences)=0; |
| 626 | + |
| 627 | + |
| 628 | + /** |
| 629 | + @brief simple getter for the vocabulary employed |
| 630 | + */ |
| 631 | + CV_WRAP virtual const std::vector<String>& getVocabulary()=0; |
| 632 | + |
| 633 | + |
| 634 | + /** @brief Creates an instance of the OCRHolisticWordRecognizer class. |
| 635 | +
|
| 636 | + @param classifierPtr an instance of TextImageClassifier, normally a DictNet instance |
| 637 | + @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line |
| 638 | + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize |
| 639 | + of the classifier. |
| 640 | + */ |
| 641 | + CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(Ptr<TextImageClassifier> classifierPtr,String vocabullaryFilename); |
| 642 | + /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DictNet classifier. |
| 643 | +
|
| 644 | + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. |
| 645 | + @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. |
| 646 | + @param vocabullaryFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line |
| 647 | + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize |
| 648 | + of the classifier. |
| 649 | + */ |
| 650 | + CV_WRAP static Ptr<OCRHolisticWordRecognizer> create(String modelArchFilename, String modelWeightsFilename, String vocabullaryFilename); |
| 651 | + |
| 652 | +}; |
| 653 | + |
| 654 | + |
468 | 655 | }
|
469 | 656 | }
|
| 657 | + |
| 658 | + |
470 | 659 | #endif // _OPENCV_TEXT_OCR_HPP_
|
0 commit comments