Skip to content

Commit 81e40d8

Browse files
fix basis, optimize & tweak the rest
1 parent 78b56d6 commit 81e40d8

File tree

1 file changed: 33 additions and 17 deletions.

lib/mindee/extraction/tax_extractor/tax_extractor.rb

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
require_relative 'ocr_extractor'
44

5+
# rubocop:disable Metrics/ClassLength
6+
57
module Mindee
68
module Extraction
79
# Tax extractor class
@@ -72,9 +74,12 @@ def self.curate_values(found_hash, min_rate_percentage, max_rate_percentage)
7274
reconstructed_hash['code'] =
7375
found_hash['code'].nil? ? found_hash['code'] : found_hash['code'].sub(%r{\s*\.*\s*$}, '')
7476

75-
if found_hash['rate'] && found_hash['rate'] < 1 && (found_hash['rate']).positive?
76-
found_hash['rate'] =
77-
found_hash['rate'] * 100
77+
if found_hash['rate']
78+
if found_hash['rate'].abs < 1
79+
found_hash['rate'] *= 10
80+
elsif found_hash['rate'].abs > 100
81+
found_hash['rate'] /= 10
82+
end
7883
end
7984
found_hash = swap_rates_if_needed(found_hash, min_rate_percentage, max_rate_percentage)
8085
found_hash = decimate_rates_if_needed(found_hash)
@@ -125,17 +130,28 @@ def self.decimate_rates_if_needed(found_hash)
125130
# @param found_hash [Hash] Hash of currently retrieved values
126131
# @return [Hash]
127132
def self.set_base_and_value(reconstructed_hash, found_hash)
128-
if !found_hash['base'].nil? && !found_hash['value'].nil? && found_hash['base'] > found_hash['value']
129-
reconstructed_hash['base'] = found_hash['value']
130-
reconstructed_hash['value'] = found_hash['base']
131-
else
132-
reconstructed_hash['base'] = found_hash['base']
133-
reconstructed_hash['value'] = found_hash['value']
133+
base = found_hash['base']
134+
value = found_hash['value']
135+
136+
if base && value
137+
reconstructed_hash['base'], reconstructed_hash['value'] = [base, value].minmax
138+
elsif base
139+
reconstructed_hash['base'] = base
140+
elsif value
141+
reconstructed_hash['value'] = value
142+
calculate_base(reconstructed_hash)
134143
end
135144

136145
reconstructed_hash
137146
end
138147

148+
# Fills in hash['base'] from hash['value'] and hash['rate'].
# Does nothing (and returns nil) when hash['rate'] is nil or not strictly positive.
# The rate is divided by 100.0 before use, i.e. it is treated as a percentage and the
# division is carried out in floating point.
# NOTE(review): assumes hash['value'] is a non-nil numeric; the visible caller only
# invokes this after assigning 'value' - confirm for any other caller.
# @param hash [Hash] Hash holding the 'rate' and 'value' keys; 'base' is written in place.
# @return [Float, nil] the computed base, or nil when no usable rate is present.
def self.calculate_base(hash)
149+
rate = hash['rate']
150+
return unless rate&.positive?
151+
152+
hash['base'] = hash['value'] / (rate / 100.0)
153+
end
154+
139155
# Extracts a single custom type of tax.
140156
# For the sake of simplicity, this only extracts the first example, unless specifically instructed otherwise.
141157
# @param ocr_result [Mindee::Parsing::Common::Ocr::Ocr] result of the OCR.
@@ -148,7 +164,6 @@ def self.extract_custom_tax(ocr_result, tax_names, min_rate_percentage = 0, max_
148164

149165
tax_names.sort!
150166
found_hash = pick_best(extract_horizontal_tax(ocr_result, tax_names), tax_names)
151-
# a tax is considered found horizontally if it has a value, otherwise it is vertical
152167
if found_hash.nil? || found_hash['value'].nil?
153168
found_hash = extract_vertical_tax(ocr_result, tax_names,
154169
found_hash)
@@ -239,14 +254,14 @@ def self.extract_horizontal_tax(ocr_result, tax_names)
239254
linear_pattern_percent_first = %r{
240255
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
241256
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]?
242-
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
243-
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
257+
((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
258+
((?:\s*-\s*)?(\d*[.,])*\d+)?
244259
}x
245260
linear_pattern_percent_second = %r{
246261
([a-zA-ZÀ-ÖØ-öø-ÿ .]*[a-zA-ZÀ-ÖØ-öø-ÿ]?)[ .]*
247262
((?:\s*-\s*)?(?:\d*[.,])*\d+[ ]?%?|%?[ ]?(?:\s*-\s*)?(?:\d*[.,])*\d+)?[ .]?
248-
((?:\s*-\s*)?(?:\d*[.,])+\d{2,})?[ .]*
249-
((?:\s*-\s*)?(\d*[.,])*\d{2,})?
263+
((?:\s*-\s*)?(?:\d*[.,])+\d+)?[ .]*
264+
((?:\s*-\s*)?(\d*[.,])*\d+)?
250265
}x
251266
ocr_result.mvision_v1.pages.each.with_index do |page, page_id|
252267
page.all_lines.each do |line|
@@ -303,7 +318,7 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
303318
page.all_words.each do |word|
304319
next if match_index(word.text, tax_names).nil?
305320

306-
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id)
321+
reconstructed_line = ocr_result.reconstruct_vertically(word.polygon, page_id, 0.25)
307322
found_hash['page_id'] = page_id if found_hash['page_id'].nil?
308323
found_hash['code'] = word.text.strip if found_hash['code'].nil?
309324
found_hash = extract_vertical_tax_values(reconstructed_line, found_hash)
@@ -315,8 +330,9 @@ def self.extract_vertical_tax(ocr_result, tax_names, found_hash)
315330
private_class_method :extract_percentage_from_tax, :extract_basis_and_value, :extract_tax_from_horizontal_line,
316331
:extract_horizontal_tax, :extract_vertical_tax_values, :extract_vertical_tax,
317332
:create_tax_field, :fix_rate, :pick_best, :calculate_score, :curate_values,
318-
:decimate_rates_if_needed, :extract_basis_and_value, :set_base_and_value, :valid_candidate?,
319-
:swap_rates_if_needed
333+
:decimate_rates_if_needed, :set_base_and_value, :valid_candidate?,
334+
:swap_rates_if_needed, :calculate_base
320335
end
321336
end
322337
end
338+
# rubocop:enable Metrics/ClassLength

0 commit comments

Comments
 (0)