Skip to content

Commit 96861b5

Browse files
authored
Add new parameter for invert_threshold (#3852)
Change default value from 0.5 to 0.7.

Signed-off-by: Stefan Weil <[email protected]>
1 parent 0df584e commit 96861b5

File tree

6 files changed

+20
-12
lines changed

6 files changed

+20
-12
lines changed

src/ccmain/linerec.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,8 @@ void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
250250
}
251251

252252
bool do_invert = tessedit_do_invert;
253-
lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
253+
float threshold = do_invert ? double(invert_threshold) : 0.0f;
254+
lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0,
254255
kWorstDictCertainty / kCertaintyScale, word_box, words,
255256
lstm_choice_mode, lstm_choice_iterations);
256257
delete im_data;

src/ccmain/tesseractclass.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,11 @@ Tesseract::Tesseract()
6363
"Break input into lines and remap boxes if present", this->params())
6464
, BOOL_MEMBER(tessedit_dump_pageseg_images, false,
6565
"Dump intermediate images made during page segmentation", this->params())
66-
, BOOL_MEMBER(tessedit_do_invert, true, "Try inverting the image in `LSTMRecognizeWord`",
66+
, BOOL_MEMBER(tessedit_do_invert, true, "Try inverted line image if necessary",
6767
this->params())
68+
, double_MEMBER(invert_threshold, 0.7,
69+
"For lines with a mean confidence below this value, OCR is also tried with an inverted image",
70+
this->params())
6871
,
6972
// The default for pageseg_mode is the old behaviour, so as not to
7073
// upset anything that relies on that.

src/ccmain/tesseractclass.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -756,6 +756,7 @@ class TESS_API Tesseract : public Wordrec {
756756
BOOL_VAR_H(tessedit_train_line_recognizer);
757757
BOOL_VAR_H(tessedit_dump_pageseg_images);
758758
BOOL_VAR_H(tessedit_do_invert);
759+
double_VAR_H(invert_threshold);
759760
INT_VAR_H(tessedit_pageseg_mode);
760761
INT_VAR_H(thresholding_method);
761762
BOOL_VAR_H(thresholding_debug);

src/lstm/lstmrecognizer.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -244,14 +244,15 @@ bool LSTMRecognizer::LoadDictionary(const ParamsVectors *params, const std::stri
244244

245245
// Recognizes the line image, contained within image_data, returning the
246246
// ratings matrix and matching box_word for each WERD_RES in the output.
247-
void LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, bool debug,
247+
void LSTMRecognizer::RecognizeLine(const ImageData &image_data,
248+
float invert_threshold, bool debug,
248249
double worst_dict_cert, const TBOX &line_box,
249250
PointerVector<WERD_RES> *words, int lstm_choice_mode,
250251
int lstm_choice_amount) {
251252
NetworkIO outputs;
252253
float scale_factor;
253254
NetworkIO inputs;
254-
if (!RecognizeLine(image_data, invert, debug, false, false, &scale_factor, &inputs, &outputs)) {
255+
if (!RecognizeLine(image_data, invert_threshold, debug, false, false, &scale_factor, &inputs, &outputs)) {
255256
return;
256257
}
257258
if (search_ == nullptr) {
@@ -317,7 +318,8 @@ void LSTMRecognizer::OutputStats(const NetworkIO &outputs, float *min_output, fl
317318

318319
// Recognizes the image_data, returning the labels,
319320
// scores, and corresponding pairs of start, end x-coords in coords.
320-
bool LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, bool debug,
321+
bool LSTMRecognizer::RecognizeLine(const ImageData &image_data,
322+
float invert_threshold, bool debug,
321323
bool re_invert, bool upside_down, float *scale_factor,
322324
NetworkIO *inputs, NetworkIO *outputs) {
323325
// This ensures consistent recognition results.
@@ -345,10 +347,10 @@ bool LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, boo
345347
Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, inputs);
346348
network_->Forward(debug, *inputs, nullptr, &scratch_space_, outputs);
347349
// Check for auto inversion.
348-
if (invert) {
350+
if (invert_threshold > 0.0f) {
349351
float pos_min, pos_mean, pos_sd;
350352
OutputStats(*outputs, &pos_min, &pos_mean, &pos_sd);
351-
if (pos_mean < 0.5f) {
353+
if (pos_mean < invert_threshold) {
352354
// Run again inverted and see if it is any better.
353355
NetworkIO inv_inputs, inv_outputs;
354356
inv_inputs.set_int_mode(IsIntMode());

src/lstm/lstmrecognizer.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -244,11 +244,12 @@ class TESS_API LSTMRecognizer {
244244

245245
// Recognizes the line image, contained within image_data, returning the
246246
// recognized tesseract WERD_RES for the words.
247-
// If invert, tries inverted as well if the normal interpretation doesn't
248-
// produce a good enough result. The line_box is used for computing the
247+
// If invert_threshold > 0, tries inverted as well if the normal
248+
// interpretation doesn't produce a result which at least reaches
249+
// that threshold. The line_box is used for computing the
249250
// box_word in the output words. worst_dict_cert is the worst certainty that
250251
// will be used in a dictionary word.
251-
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert,
252+
void RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, double worst_dict_cert,
252253
const TBOX &line_box, PointerVector<WERD_RES> *words, int lstm_choice_mode = 0,
253254
int lstm_choice_amount = 5);
254255

@@ -263,7 +264,7 @@ class TESS_API LSTMRecognizer {
263264
// improve the results. This ensures that outputs contains the correct
264265
// forward outputs for the best photometric interpretation.
265266
// inputs is filled with the used inputs to the network.
266-
bool RecognizeLine(const ImageData &image_data, bool invert, bool debug, bool re_invert,
267+
bool RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, bool re_invert,
267268
bool upside_down, float *scale_factor, NetworkIO *inputs, NetworkIO *outputs);
268269

269270
// Converts an array of labels to utf-8, whether or not the labels are

src/training/unicharset/lstmtrainer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -948,7 +948,7 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData *trainingdata,
948948
float image_scale;
949949
NetworkIO inputs;
950950
bool invert = trainingdata->boxes().empty();
951-
if (!RecognizeLine(*trainingdata, invert, debug, invert, upside_down,
951+
if (!RecognizeLine(*trainingdata, invert ? 0.5f : 0.0f, debug, invert, upside_down,
952952
&image_scale, &inputs, fwd_outputs)) {
953953
tprintf("Image %s not trainable\n", trainingdata->imagefilename().c_str());
954954
return UNENCODABLE;

0 commit comments

Comments
 (0)