Skip to content

Commit b093168

Browse files
author
Gerit Wagner
committed
add
1 parent 97d71d4 commit b093168

File tree

7 files changed

+454
-12
lines changed

7 files changed

+454
-12
lines changed

bib_dedupe/match.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ def __get_maybe_pairs(pairs: pd.DataFrame, true_pairs: pd.DataFrame) -> pd.DataF
132132
)
133133
]
134134

135+
# TODO: drop cases from maybe where one contains "part 1", the other "part 2"
136+
135137
# Add a label column to each dataframe
136138
maybe_pairs[DUPLICATE_LABEL] = MAYBE
137139
# Select the ID_1, SEARCH_SET_1 and ID_2, SEARCH_SET_2 fields and the new label column

bib_dedupe/match_conditions.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,12 @@ def both_entrytypes(entrytype: str) -> str:
5959
# - Queries are better for debugging (understanding which conditions do/do not apply)
6060
# https://jakevdp.github.io/PythonDataScienceHandbook/03.12-performance-eval-and-query.html
6161

62+
6263
duplicate_conditions = [
6364
# Substantial differences in one of AUTHOR/TITLE/CONTAINER_TITLE
6465
f"({au07_ti10_ct10} & {match(VOLUME, PAGES)})",
6566
f"({au07_ti10_ct10} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
66-
f"({au10_ti07_ct10} & {non_contradicting(NUMBER, PAGES, YEAR, DOI)})",
67+
f"({au10_ti07_ct10} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
6768
f"({au10_ti10_ct07} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
6869
# Differences across AUTHOR/TITLE/CONTAINER_TITLE
6970
f"({au08_ti09_ct09} & {non_contradicting(VOLUME, NUMBER, YEAR, DOI)} & {PAGES} > 0.75 )",
@@ -78,7 +79,7 @@ def both_entrytypes(entrytype: str) -> str:
7879
f'({au095_ti09_ct075} & {both_entrytypes("inproceedings")} & {match(YEAR)})', # Inproceedings
7980
f"({au07_ti10_ct10} & {DOI} > 0.9)", # Updates
8081
# no AUTHOR
81-
f"({auXX_ti095_ct095} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
82+
f'({auXX_ti095_ct095} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)}) & {both_entrytypes("article")}',
8283
f"({auXX_ti095_ct095} & {match(VOLUME, NUMBER, PAGES, YEAR)} & {non_contradicting(DOI, ABSTRACT)})",
8384
# no CONTAINER_TITLE
8485
f"({au10_ti10_ctNC} & {match(VOLUME, YEAR)} & {non_contradicting(NUMBER, PAGES, DOI, ABSTRACT)})",
@@ -92,13 +93,26 @@ def both_entrytypes(entrytype: str) -> str:
9293
f"(({match(DOI)} & ~(doi_1 == '' | doi_2 == '')) & ({TITLE} > 0.95) & ({AUTHOR} > 0.9) & ({YEAR} > 0.9)) & {non_contradicting(CONTAINER_TITLE)} ",
9394
# no TITLE
9495
f"({au10_tiXX_ct10} & {match(VOLUME, NUMBER, PAGES, YEAR)} & {non_contradicting(DOI)} & ({ABSTRACT} > 0.95 | {non_contradicting(ABSTRACT)}))", # typically for number-mismatches in title
96+
# early_view_vs_final
97+
f"({au095_ti09_ct075}"
98+
f" & {non_contradicting(DOI)}"
99+
f" & ((volume_1 != '' & volume_2 == '') | (volume_2 != '' & volume_1 == ''))"
100+
f" & ((number_1 != '' & number_2 == '') | (number_2 != '' & number_1 == '') | {non_contradicting(NUMBER)})"
101+
f" & (pages_1.str.match('^1[-–]') | pages_2.str.match('^1[-–]'))"
102+
f")",
95103
]
96104

97105
non_duplicate_conditions = [
98106
f"({mismatch(YEAR)} & ~({match(VOLUME)} | {match(NUMBER)} | {match(PAGES)} | {match(DOI)} | {match(CONTAINER_TITLE)}))",
99107
f'({mismatch(TITLE)} & ({PAGE_RANGES_ADJACENT} == "adjacent" | {PAGE_RANGES_ADJACENT} == "non_overlapping"))',
100108
f"(~(doi_1 == '' | doi_2 == '') & {DOI} < 0.8 & ~({non_contradicting(AUTHOR, TITLE, YEAR, CONTAINER_TITLE, VOLUME, NUMBER, PAGES)}))",
101-
f"({mismatch(VOLUME, NUMBER, PAGES)})",
109+
# f"({mismatch(VOLUME, NUMBER, PAGES)})",
110+
f"({mismatch(VOLUME, NUMBER, PAGES)}"
111+
f" & ~({au095_ti09_ct075}"
112+
f" & ((volume_1 != '' & volume_2 == '') | (volume_2 != '' & volume_1 == ''))"
113+
f" & (pages_1.str.match('^1[-–]') | pages_2.str.match('^1[-–]'))"
114+
f" )"
115+
f")",
102116
# Editorials: minor differences in volume/number/pages can be meaningful
103117
f'(title_1.str.contains("editor") & title_1.str.len() < 60 & ( {mismatch(VOLUME)} | {mismatch(NUMBER)} | {mismatch(PAGES)}))',
104118
# Journal vs. conference/workshop
@@ -110,4 +124,14 @@ def both_entrytypes(entrytype: str) -> str:
110124
f' ~({CONTAINER_TITLE}_2.str.contains("conf") | {CONTAINER_TITLE}_2.str.contains("work") | {CONTAINER_TITLE}_2.str.contains("proc")) ) & '
111125
f' ( ({CONTAINER_TITLE}_1.str.contains("conf") | {CONTAINER_TITLE}_1.str.contains("work") | {CONTAINER_TITLE}_1.str.contains("proc")) & '
112126
f' ~{CONTAINER_TITLE}_1.str.contains("j") ))',
127+
# Inproceedings: more sensitive to year mismatches
128+
f'({both_entrytypes("inproceedings")} & {mismatch(YEAR)})',
129+
# TODO : we may need to pre-compute this:
130+
# https://chatgpt.com/c/695554d0-a548-8332-9921-ec28332246fd
131+
# NEW: container title appears inside (either) title => treat as non-duplicate with anything else
132+
# (e.g., "Communications of the Association for Information Systems ... Volume 52 Paper 41")
133+
# f'(container_title_1 != "" & title_1 != "" & '
134+
# f' title_1.str.contains(container_title_1.str.lower(), na=False))'
135+
# f' | (container_title_2 != "" & title_2 != "" & '
136+
# f' title_2.str.contains(container_title_2.str.lower(), na=False))',
113137
]

bib_dedupe/prep_title.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ def remove_erratum_suffix(title: str) -> str:
4242
return title
4343

4444

45+
def remove_authors_personal_copy(title: str) -> str:
    """Strip a leading or trailing "Author's personal copy" marker from *title*.

    Publisher PDFs often stamp this phrase onto the first page, and it leaks
    into extracted titles. The apostrophe is assumed to have been replaced by a
    space in earlier title preprocessing ("author s personal copy"). Matching
    is case-insensitive, tolerating surrounding punctuation/whitespace.

    Returns the title with the marker (and adjacent punctuation) removed and
    outer whitespace stripped; titles without the marker are returned unchanged
    apart from outer-whitespace stripping.
    """
    # Alternation: marker at the very start (with trailing separators) OR at
    # the very end (with leading separators). re.IGNORECASE makes the match
    # case-insensitive, as intended.
    pattern = r"^\s*author s personal copy[\s\-\–—:;,.]*|[\s\-\–—:;,.]*author s personal copy\s*$"
    return re.sub(pattern, "", title, flags=re.IGNORECASE).strip()
50+
51+
4552
# flake8: noqa: E501
4653
# pylint: disable=line-too-long
4754
def prep_title(title_array: np.array) -> np.array:
@@ -175,8 +182,16 @@ def prep_title(title_array: np.array) -> np.array:
175182
]
176183
)
177184

185+
# Remove "Author's personal copy" at beginning/end
186+
title_array = np.array(
187+
[remove_authors_personal_copy(title) for title in title_array]
188+
)
189+
178190
# Replace multiple spaces with a single space
179191
title_array = np.array(
180192
[re.sub(r"\s+", " ", title).rstrip().lstrip() for title in title_array]
181193
)
182194
return title_array
195+
196+
197+
# TODO : organize more efficiently, see: https://chatgpt.com/c/6967a4de-d35c-8333-8a3b-8e5f8ea3329c

bib_dedupe/sim.py

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def sim_title(title_1: str, title_2: str, debug: bool = False) -> float:
126126
t1 = str(title_1)
127127
t2 = str(title_2)
128128

129-
if t1 == "" and t2 == "":
129+
if t1 in ["", "book title"] or t2 in ["", "book title"]:
130130
return 0.0
131131

132132
if t1.replace(" ", "") == t2.replace(" ", "") and t1.replace(" ", "") != "":
@@ -140,6 +140,10 @@ def sim_title(title_1: str, title_2: str, debug: bool = False) -> float:
140140
"comment",
141141
"response",
142142
"reply",
143+
"update",
144+
"forum",
145+
"proposed",
146+
"talk",
143147
]
144148
]
145149
):
@@ -314,6 +318,40 @@ def sim_volume(v1_str: str, v2_str: str) -> float:
314318
return 0.0
315319

316320

321+
# def _norm_abstract(text: str) -> str:
322+
# text = "" if text is None else str(text)
323+
324+
# # normalize common patterns
325+
# text = re.sub(r"([a-z])\s+(\d)", r"\1\2", text)
326+
# text = re.sub(r"([a-z])\s+([a-z])", r"\1\2", text)
327+
328+
# # remove punctuation (keep letters/numbers/spaces)
329+
# text = re.sub(r"[^a-z0-9\s]", " ", text)
330+
331+
# # collapse whitespace
332+
# text = re.sub(r"\s+", " ", text).strip()
333+
# return text
334+
335+
336+
# def sim_abstract(abstract_1: str, abstract_2: str) -> float:
337+
# a1 = _norm_abstract(abstract_1)
338+
# a2 = _norm_abstract(abstract_2)
339+
340+
# if not a1 or not a2:
341+
# return 0.0
342+
343+
# # If one is essentially a prefix/subsequence of the other (truncated abstract),
344+
# # partial_ratio will capture it much better than ratio.
345+
# s_ratio = fuzz.ratio(a1, a2) / 100.0
346+
# s_partial = fuzz.partial_ratio(a1, a2) / 100.0
347+
348+
# # token_set helps when words are same but order/noise differs
349+
# s_token = fuzz.token_set_ratio(a1, a2) / 100.0
350+
351+
# # take the best signal; you can also blend (see below)
352+
# return max(s_ratio, s_partial, s_token)
353+
354+
317355
def _norm_abstract(text: str) -> str:
318356
text = "" if text is None else str(text)
319357

@@ -329,22 +367,32 @@ def _norm_abstract(text: str) -> str:
329367
return text
330368

331369

332-
def sim_abstract(abstract_1: str, abstract_2: str) -> float:
333-
a1 = _norm_abstract(abstract_1)
334-
a2 = _norm_abstract(abstract_2)
370+
def sim_abstract(a1: str, a2: str) -> float:
    """Return a similarity score in [0, 1] for two abstracts.

    Inputs are expected to be pre-normalized strings (normalization was moved
    out of this function). Empty inputs score 0.0. Cheap exact/prefix checks
    short-circuit before the more expensive fuzzy comparisons.
    """
    if not a1 or not a2:
        return 0.0

    # Fast path 1: exact match.
    if a1 == a2:
        return 1.0

    # Fast path 2: truncated/prefix abstracts. Only attempted for long
    # abstracts (> 500 chars); the last `tail` characters are ignored because
    # truncation artifacts (ellipses, cut-off words) typically occur there.
    # NOTE(review): the original also re-checked len(...) > tail here, which is
    # always true once both lengths exceed 500 — the redundant guard is dropped.
    tail = 100
    if len(a1) > 500 and len(a2) > 500:
        if a1.startswith(a2[:-tail]) or a2.startswith(a1[:-tail]):
            return 1.0

    # Fuzzy comparisons, short-circuiting on perfect scores:
    # - ratio: whole-string edit similarity
    # - partial_ratio: best-matching substring (captures truncated abstracts)
    # - token_set_ratio: same words, different order/noise
    s_ratio = fuzz.ratio(a1, a2) / 100.0
    if s_ratio == 1.0:
        return 1.0
    s_partial = fuzz.partial_ratio(a1, a2) / 100.0
    if s_partial == 1.0:
        return 1.0
    s_token = fuzz.token_set_ratio(a1, a2) / 100.0

    # Take the best of the three signals.
    return max(s_ratio, s_partial, s_token)
349397

350398

0 commit comments

Comments
 (0)