22
33require_relative 'ocr_extractor'
44
5+ # rubocop:disable Metrics/ClassLength
6+
57module Mindee
68 module Extraction
79 # Tax extractor class
@@ -72,9 +74,12 @@ def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
7274 reconstructed_hash [ 'code' ] =
7375 found_hash [ 'code' ] . nil? ? found_hash [ 'code' ] : found_hash [ 'code' ] . sub ( %r{\s *\. *\s *$} , '' )
7476
75- if found_hash [ 'rate' ] && found_hash [ 'rate' ] < 1 && ( found_hash [ 'rate' ] ) . positive?
76- found_hash [ 'rate' ] =
77- found_hash [ 'rate' ] * 100
77+ if found_hash [ 'rate' ]
78+ if found_hash [ 'rate' ] . abs < 1
79+ found_hash [ 'rate' ] *= 10
80+ elsif found_hash [ 'rate' ] . abs > 100
81+ found_hash [ 'rate' ] /= 10
82+ end
7883 end
7984 found_hash = swap_rates_if_needed ( found_hash , min_rate_percentage , max_rate_percentage )
8085 found_hash = decimate_rates_if_needed ( found_hash )
@@ -125,17 +130,28 @@ def self.decimate_rates_if_needed(found_hash)
125130 # @param found_hash [Hash] Hash of currently retrieved values
126131 # @return [Hash]
def self.set_base_and_value(reconstructed_hash, found_hash)
  # Copies the 'base' and 'value' entries of found_hash into reconstructed_hash.
  # Both keys are read up front so later writes cannot influence what is read.
  found_base = found_hash['base']
  found_value = found_hash['value']

  if found_base
    if found_value
      # Both amounts are present: the smaller one is stored under 'base',
      # the larger one under 'value'.
      smaller, larger = [found_base, found_value].minmax
      reconstructed_hash['base'] = smaller
      reconstructed_hash['value'] = larger
    else
      # Only a base was found: keep it as-is and leave 'value' untouched.
      reconstructed_hash['base'] = found_base
    end
  elsif found_value
    # Only a value was found: store it, then let calculate_base try to
    # derive the missing base from the hash contents.
    reconstructed_hash['value'] = found_value
    calculate_base(reconstructed_hash)
  end

  reconstructed_hash
end
138147
# Derives the 'base' entry of a tax hash from its 'value' and 'rate' entries.
# The rate is treated as a percentage (it is divided by 100.0 before use).
# The hash is modified in place; nothing is written when the rate is missing
# or not strictly positive, or when there is no value to derive the base from.
# @param hash [Hash] Hash holding the 'rate' and 'value' entries; receives 'base'.
# @return [Float, nil] the computed base, or nil when nothing could be computed.
def self.calculate_base(hash)
  rate = hash['rate']
  value = hash['value']
  # A nil/zero/negative rate would give a meaningless (or infinite) base.
  return unless rate&.positive?
  # Guard against a missing value so that the division below cannot raise.
  return if value.nil?

  hash['base'] = value / (rate / 100.0)
end
154+
139155 # Extracts a single custom type of tax.
140156 # For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
141157 # @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
@@ -148,7 +164,6 @@ def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_
148164
149165 tax_names . sort!
150166 found_hash = pick_best ( extract_horizontal_tax ( ocr_result , tax_names ) , tax_names )
151- # a tax is considered found horizontally if it has a value, otherwise it is vertical
152167 if found_hash . nil? || found_hash [ 'value' ] . nil?
153168 found_hash = extract_vertical_tax ( ocr_result , tax_names ,
154169 found_hash )
@@ -239,14 +254,14 @@ def self.extract_horizontal_tax(ocr_result, tax_names)
239254 linear_pattern_percent_first = %r{
240255 ((?:\s *-\s *)?(?:\d *[.,])*\d +[ ]?%?|%?[ ]?(?:\s *-\s *)?(?:\d *[.,])*\d +)?[ .]?
241256 ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
242- ((?:\s *-\s *)?(?:\d *[.,])+\d {2,} )?[ .]*
243- ((?:\s *-\s *)?(\d *[.,])*\d {2,} )?
257+ ((?:\s *-\s *)?(?:\d *[.,])+\d + )?[ .]*
258+ ((?:\s *-\s *)?(\d *[.,])*\d + )?
244259 }x
245260 linear_pattern_percent_second = %r{
246261 ([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
247262 ((?:\s *-\s *)?(?:\d *[.,])*\d +[ ]?%?|%?[ ]?(?:\s *-\s *)?(?:\d *[.,])*\d +)?[ .]?
248- ((?:\s *-\s *)?(?:\d *[.,])+\d {2,} )?[ .]*
249- ((?:\s *-\s *)?(\d *[.,])*\d {2,} )?
263+ ((?:\s *-\s *)?(?:\d *[.,])+\d + )?[ .]*
264+ ((?:\s *-\s *)?(\d *[.,])*\d + )?
250265 }x
251266 ocr_result . mvision_v1 . pages . each . with_index do |page , page_id |
252267 page . all_lines . each do |line |
@@ -303,7 +318,7 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
303318 page . all_words . each do |word |
304319 next if match_index ( word . text , tax_names ) . nil?
305320
306- reconstructed_line = ocr_result . reconstruct_vertically ( word . polygon , page_id )
321+ reconstructed_line = ocr_result . reconstruct_vertically ( word . polygon , page_id , 0.25 )
307322 found_hash [ 'page_id' ] = page_id if found_hash [ 'page_id' ] . nil?
308323 found_hash [ 'code' ] = word . text . strip if found_hash [ 'code' ] . nil?
309324 found_hash = extract_vertical_tax_values ( reconstructed_line , found_hash )
@@ -315,8 +330,9 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
315330 private_class_method :extract_percentage_from_tax , :extract_basis_and_value , :extract_tax_from_horizontal_line ,
316331 :extract_horizontal_tax , :extract_vertical_tax_values , :extract_vertical_tax ,
317332 :create_tax_field , :fix_rate , :pick_best , :calculate_score , :curate_values ,
318- :decimate_rates_if_needed , :extract_basis_and_value , : set_base_and_value, :valid_candidate? ,
319- :swap_rates_if_needed
333+ :decimate_rates_if_needed , :set_base_and_value , :valid_candidate? ,
334+ :swap_rates_if_needed , :calculate_base
320335 end
321336 end
322337end
338+ # rubocop:enable Metrics/ClassLength
0 commit comments