Skip to content

Commit f92d2e1

Browse files
authored
code (#54)
1 parent 9afccc8 commit f92d2e1

File tree

2 files changed

+14
-3
lines changed

2 files changed

+14
-3
lines changed

bib_dedupe/match.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
import bib_dedupe.match_conditions
99
import bib_dedupe.sim
10-
import bib_dedupe.util
1110
from bib_dedupe import verbose_print
1211
from bib_dedupe.constants.colors import END
1312
from bib_dedupe.constants.colors import GREEN

bib_dedupe/match_conditions.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,21 @@ def mismatch(*keys: str) -> str:
2323

2424

2525
def match(*args: str, threshold: float = 1.0) -> str:
    """Build a pandas-query expression requiring matching, non-empty fields.

    For every field name in ``args`` the returned expression requires that:

    1) the similarity column ``<arg>`` meets the threshold, and
    2) BOTH raw fields (``<arg>_1`` and ``<arg>_2``) are non-empty strings.

    Assumes columns like: doi + doi_1 + doi_2, title + title_1 + title_2, ...

    Args:
        *args: Field names; each one names a similarity column and, with the
            ``_1`` / ``_2`` suffixes, the two raw value columns.
        threshold: With the default ``1.0`` the similarity must equal 1.0
            exactly; with any other value it must be strictly greater than
            the threshold.

    Returns:
        The query expression, or an empty string when no field is given.
    """
    if not args:
        # Without fields the template below would yield "() & ()", which is
        # not a parsable expression.
        return ""

    if threshold == 1.0:
        sim_parts = [f"({arg} == 1.0)" for arg in args]
    else:
        sim_parts = [f"({arg} > {threshold})" for arg in args]

    # Each comparison is parenthesised on its own so the expression does not
    # depend on the evaluating parser giving `&` a lower precedence than `!=`.
    non_empty_parts = [
        f"(({arg}_1 != '') & ({arg}_2 != ''))" for arg in args
    ]

    sim_expr = " & ".join(sim_parts)
    non_empty_expr = " & ".join(non_empty_parts)
    return f"({sim_expr}) & ({non_empty_expr})"
2941

3042

3143
def non_contradicting(*keys: str) -> str:

0 commit comments

Comments
 (0)