Skip to content

Commit c1f5fd0

Browse files
author
Gerit Wagner
committed
update sim (abstract)
1 parent b3ace4b commit c1f5fd0

File tree

1 file changed

+27
-9
lines changed

1 file changed

+27
-9
lines changed

bib_dedupe/sim.py

Lines changed: 27 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -314,20 +314,38 @@ def sim_volume(v1_str: str, v2_str: str) -> float:
314314
return 0.0
315315

316316

317+
def _norm_abstract(text: str) -> str:
318+
text = "" if text is None else str(text)
319+
320+
# normalize common patterns
321+
text = re.sub(r"([a-z])\s+(\d)", r"\1\2", text)
322+
text = re.sub(r"([a-z])\s+([a-z])", r"\1\2", text)
323+
324+
# remove punctuation (keep letters/numbers/spaces)
325+
text = re.sub(r"[^a-z0-9\s]", " ", text)
326+
327+
# collapse whitespace
328+
text = re.sub(r"\s+", " ", text).strip()
329+
return text
330+
331+
317332
def sim_abstract(abstract_1: str, abstract_2: str) -> float:
    """Return a similarity score for two abstracts.

    Both inputs are normalized with ``_norm_abstract``. If either normalized
    string is empty the score is ``0.0``. Otherwise the score is the highest
    of three ``fuzz`` measures (``ratio``, ``partial_ratio`` and
    ``token_set_ratio``), each divided by 100.
    """
    a1 = _norm_abstract(abstract_1)
    a2 = _norm_abstract(abstract_2)

    if not a1 or not a2:
        return 0.0

    # Identical normalized text is a perfect match; skip the fuzzy passes.
    if a1 == a2:
        return 1.0

    # ratio compares the full strings; partial_ratio is meant to score a
    # truncated abstract against its complete version more generously.
    # NOTE(review): partial_ratio presumably also scores a very short string
    # highly whenever it occurs inside the longer one; the previous version
    # only applied its prefix shortcut to abstracts longer than 500
    # characters. Confirm that short abstracts cannot cause false matches.
    s_ratio = fuzz.ratio(a1, a2) / 100.0
    s_partial = fuzz.partial_ratio(a1, a2) / 100.0

    # token_set_ratio is meant to help when the words are the same but their
    # order or surrounding noise differs.
    s_token = fuzz.token_set_ratio(a1, a2) / 100.0

    # The strongest of the three signals wins.
    return max(s_ratio, s_partial, s_token)
331349

332350

333351
def sim_container_title(container_1: str, container_2: str) -> float:

0 commit comments

Comments
 (0)