Skip to content

Commit 94e452b

Browse files
committed
Improve PDF search of names
1 parent 2bac5a9 commit 94e452b

File tree

2 files changed

+37
-18
lines changed

2 files changed

+37
-18
lines changed

app/models/name.rb

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,6 +1323,31 @@ def check(type)
13231323
checks.find { |check| check.kind == type.to_s }
13241324
end
13251325

1326+
# Returns the Set of plain-text spellings under which this name may appear
# in a document. The Set always contains +base_name+ and, when present,
# +corrigendum_from+. Depending on +inferred_rank+ it also contains:
#
# - for subspecies: each spelling with the first " subsp. " connector
#   replaced by a single space
# - for species and subspecies: each spelling with its first word
#   abbreviated to its initial followed by a period (e.g. "E. coli")
#
# The result is memoized in +@text_variants+ once it is non-empty.
def text_variants
  @text_variants ||= Set.new
  return @text_variants if @text_variants.any?

  # Discard nil AND blank spellings: an empty string is a substring of every
  # text (String#index('') returns 0), so keeping it would make any search
  # based on these variants succeed unconditionally.
  seeds = [base_name, corrigendum_from].reject { |s| s.to_s.strip.empty? }

  variants = Set.new(seeds)
  if inferred_rank == 'subspecies'
    variants.merge(seeds.map { |s| s.sub(/ subsp\. /, ' ') })
  end
  if %w[species subspecies].include?(inferred_rank)
    # Abbreviate the leading word (two or more characters) to its initial
    variants.merge(seeds.map { |s| s.gsub(/^(\S)\S+\s/, '\\1. ') })
  end

  @text_variants = variants
end
1340+
1341+
# Returns the Set of spellings to look for in text extracted from a PDF.
# For every entry of +text_variants+ it holds three forms:
#
# - the spelling itself
# - the spelling with a single space inserted between every pair of
#   consecutive characters
# - the spelling with all space characters removed
#
# The result is memoized in +@pdf_variants+ once it is non-empty.
def pdf_variants
  @pdf_variants ||= Set.new
  return @pdf_variants if @pdf_variants.any?

  plain = text_variants
  letter_spaced = plain.map { |spelling| spelling.chars.join(' ') }
  unspaced = plain.map { |spelling| spelling.delete(' ') }

  @pdf_variants = @pdf_variants + plain + letter_spaced + unspaced
end
1350+
13261351
def fresh_name_order
13271352
y = ''
13281353
if parent && parent.rank_index < rank_index

app/models/register/status.rb

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -280,44 +280,38 @@ def automated_validation
280280
# IMPORTANT: Notes are soft-registered, remember to +save+ to make them
281281
# persistent
282282
def check_pdf_files
283-
has_acc = false
284283
inames = Hash[names.map { |n| [n, false] }]
285284
anames = Hash[names.map { |n| [n, false] }]
286285
[publication_pdf, supplementary_pdf].each do |as|
287-
break if has_acc && inames.values.all?
288286
next unless as.attached?
287+
break if anames.values.all? && inames.values.all?
289288

290289
as.open do |file|
291290
render = PDF::Reader.new(file.path)
292291
render.pages.each do |page|
293-
txt = page.text
294-
has_acc = true if txt.index(accession)
295-
inames.each_key do |n|
296-
bn = n.base_name
297-
cn = n.corrigendum_from
298-
inames[n] = true if txt.index(bn) || (cn && txt.index(cn))
299-
anames[n] = true if txt.index(n.seqcode_url(false))
292+
txt = page.text.unicode_normalize(:nfkc)
293+
anames.each { |n, _| anames[n] = true } if txt.index(accession)
294+
names.each do |n|
295+
inames[n] ||= n.pdf_variants.find { |i| txt.index(i) }.present?
296+
anames[n] ||= txt.index(n.seqcode_url(false)).present?
300297
end
301-
break if (has_acc || anames.values.all?) && inames.values.all?
298+
break if anames.values.all? && inames.values.all?
302299
end
303300
end
304301
end
305302

306303
names.each do |n|
307-
v = has_acc || anames[n]
308-
Check.create_with(pass: v).find_or_create_by(
304+
Check.create_with(pass: anames[n]).find_or_create_by(
309305
name: n, kind: :effective_publication_missing_accession
310-
).update(pass: v)
306+
).update(pass: anames[n])
311307

312-
v = inames[n]
313-
Check.create_with(pass: v).find_or_create_by(
308+
Check.create_with(pass: inames[n]).find_or_create_by(
314309
name: n, kind: :name_missing_in_effective_publication
315-
).update(pass: v)
310+
).update(pass: inames[n])
316311
end
317312

318313
add_note('The effective publication files have been parsed')
319-
320-
has_acc && inames.values.all?
314+
anames.values.all? && inames.values.all?
321315
rescue => e
322316
add_note('ERROR: The effective publication files could not be parsed')
323317
raise e

0 commit comments

Comments
 (0)