Skip to content

Commit b093168

Browse files
author
Gerit Wagner
committed
add
1 parent 97d71d4 commit b093168

File tree

7 files changed

+454
-12
lines changed

7 files changed

+454
-12
lines changed

bib_dedupe/match.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ def __get_maybe_pairs(pairs: pd.DataFrame, true_pairs: pd.DataFrame) -> pd.DataF
132132
)
133133
]
134134

135+
# TODO: drop cases from maybe where one contains "part 1", the other "part 2"
136+
135137
# Add a label column to each dataframe
136138
maybe_pairs[DUPLICATE_LABEL] = MAYBE
137139
# Select the ID_1, SEARCH_SET_1 and ID_2, SEARCH_SET_2 fields and the new label column

bib_dedupe/match_conditions.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,12 @@ def both_entrytypes(entrytype: str) -> str:
5959
# - Queries are better for debugging (understanding which conditions do/do not apply)
6060
# https://jakevdp.github.io/PythonDataScienceHandbook/03.12-performance-eval-and-query.html
6161

62+
6263
duplicate_conditions = [
6364
# Substantial differences in one of AUTHOR/TITLE/CONTAINER_TITLE
6465
f"({au07_ti10_ct10} & {match(VOLUME, PAGES)})",
6566
f"({au07_ti10_ct10} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
66-
f"({au10_ti07_ct10} & {non_contradicting(NUMBER, PAGES, YEAR, DOI)})",
67+
f"({au10_ti07_ct10} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
6768
f"({au10_ti10_ct07} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
6869
# Differences across AUTHOR/TITLE/CONTAINER_TITLE
6970
f"({au08_ti09_ct09} & {non_contradicting(VOLUME, NUMBER, YEAR, DOI)} & {PAGES} > 0.75 )",
@@ -78,7 +79,7 @@ def both_entrytypes(entrytype: str) -> str:
7879
f'({au095_ti09_ct075} & {both_entrytypes("inproceedings")} & {match(YEAR)})', # Inproceedings
7980
f"({au07_ti10_ct10} & {DOI} > 0.9)", # Updates
8081
# no AUTHOR
81-
f"({auXX_ti095_ct095} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)})",
82+
f'({auXX_ti095_ct095} & {non_contradicting(VOLUME, NUMBER, PAGES, YEAR, DOI)}) & {both_entrytypes("article")}',
8283
f"({auXX_ti095_ct095} & {match(VOLUME, NUMBER, PAGES, YEAR)} & {non_contradicting(DOI, ABSTRACT)})",
8384
# no CONTAINER_TITLE
8485
f"({au10_ti10_ctNC} & {match(VOLUME, YEAR)} & {non_contradicting(NUMBER, PAGES, DOI, ABSTRACT)})",
@@ -92,13 +93,26 @@ def both_entrytypes(entrytype: str) -> str:
9293
f"(({match(DOI)} & ~(doi_1 == '' | doi_2 == '')) & ({TITLE} > 0.95) & ({AUTHOR} > 0.9) & ({YEAR} > 0.9)) & {non_contradicting(CONTAINER_TITLE)} ",
9394
# no TITLE
9495
f"({au10_tiXX_ct10} & {match(VOLUME, NUMBER, PAGES, YEAR)} & {non_contradicting(DOI)} & ({ABSTRACT} > 0.95 | {non_contradicting(ABSTRACT)}))", # typically for number-mismatches in title
96+
# early_view_vs_final
97+
f"({au095_ti09_ct075}"
98+
f" & {non_contradicting(DOI)}"
99+
f" & ((volume_1 != '' & volume_2 == '') | (volume_2 != '' & volume_1 == ''))"
100+
f" & ((number_1 != '' & number_2 == '') | (number_2 != '' & number_1 == '') | {non_contradicting(NUMBER)})"
101+
f" & (pages_1.str.match('^1[-–]') | pages_2.str.match('^1[-–]'))"
102+
f")",
95103
]
96104

97105
non_duplicate_conditions = [
98106
f"({mismatch(YEAR)} & ~({match(VOLUME)} | {match(NUMBER)} | {match(PAGES)} | {match(DOI)} | {match(CONTAINER_TITLE)}))",
99107
f'({mismatch(TITLE)} & ({PAGE_RANGES_ADJACENT} == "adjacent" | {PAGE_RANGES_ADJACENT} == "non_overlapping"))',
100108
f"(~(doi_1 == '' | doi_2 == '') & {DOI} < 0.8 & ~({non_contradicting(AUTHOR, TITLE, YEAR, CONTAINER_TITLE, VOLUME, NUMBER, PAGES)}))",
101-
f"({mismatch(VOLUME, NUMBER, PAGES)})",
109+
# f"({mismatch(VOLUME, NUMBER, PAGES)})",
110+
f"({mismatch(VOLUME, NUMBER, PAGES)}"
111+
f" & ~({au095_ti09_ct075}"
112+
f" & ((volume_1 != '' & volume_2 == '') | (volume_2 != '' & volume_1 == ''))"
113+
f" & (pages_1.str.match('^1[-–]') | pages_2.str.match('^1[-–]'))"
114+
f" )"
115+
f")",
102116
# Editorials: minor differences in volume/number/pages can be meaningful
103117
f'(title_1.str.contains("editor") & title_1.str.len() < 60 & ( {mismatch(VOLUME)} | {mismatch(NUMBER)} | {mismatch(PAGES)}))',
104118
# Journal vs. conference/workshop
@@ -110,4 +124,14 @@ def both_entrytypes(entrytype: str) -> str:
110124
f' ~({CONTAINER_TITLE}_2.str.contains("conf") | {CONTAINER_TITLE}_2.str.contains("work") | {CONTAINER_TITLE}_2.str.contains("proc")) ) & '
111125
f' ( ({CONTAINER_TITLE}_1.str.contains("conf") | {CONTAINER_TITLE}_1.str.contains("work") | {CONTAINER_TITLE}_1.str.contains("proc")) & '
112126
f' ~{CONTAINER_TITLE}_1.str.contains("j") ))',
127+
# Inproceedings: more sensitive to year mismatches
128+
f'({both_entrytypes("inproceedings")} & {mismatch(YEAR)})',
129+
# TODO : we may need to pre-compute this:
130+
# https://chatgpt.com/c/695554d0-a548-8332-9921-ec28332246fd
131+
# NEW: container title appears inside (either) title => treat as non-duplicate with anything else
132+
# (e.g., "Communications of the Association for Information Systems ... Volume 52 Paper 41")
133+
# f'(container_title_1 != "" & title_1 != "" & '
134+
# f' title_1.str.contains(container_title_1.str.lower(), na=False))'
135+
# f' | (container_title_2 != "" & title_2 != "" & '
136+
# f' title_2.str.contains(container_title_2.str.lower(), na=False))',
113137
]

bib_dedupe/prep_title.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ def remove_erratum_suffix(title: str) -> str:
4242
return title
4343

4444

45+
def remove_authors_personal_copy(title: str) -> str:
    """Strip a leading or trailing "Author's personal copy" marker from *title*.

    Publisher PDFs often stamp this phrase onto the first page, and it leaks
    into extracted titles. The apostrophe is assumed to have been replaced by a
    space in earlier title preprocessing ("author s personal copy"). Matching
    is case-insensitive, tolerating surrounding punctuation/whitespace.

    Returns the title with the marker (and adjacent punctuation) removed and
    outer whitespace stripped; titles without the marker are returned unchanged
    apart from outer-whitespace stripping.
    """
    # Alternation: marker at the very start (with trailing separators) OR at
    # the very end (with leading separators). re.IGNORECASE makes the match
    # case-insensitive, as intended.
    pattern = r"^\s*author s personal copy[\s\-\–—:;,.]*|[\s\-\–—:;,.]*author s personal copy\s*$"
    return re.sub(pattern, "", title, flags=re.IGNORECASE).strip()
50+
51+
4552
# flake8: noqa: E501
4653
# pylint: disable=line-too-long
4754
def prep_title(title_array: np.array) -> np.array:
@@ -175,8 +182,16 @@ def prep_title(title_array: np.array) -> np.array:
175182
]
176183
)
177184

185+
# Remove "Author's personal copy" at beginning/end
186+
title_array = np.array(
187+
[remove_authors_personal_copy(title) for title in title_array]
188+
)
189+
178190
# Replace multiple spaces with a single space
179191
title_array = np.array(
180192
[re.sub(r"\s+", " ", title).rstrip().lstrip() for title in title_array]
181193
)
182194
return title_array
195+
196+
197+
# TODO : organize more efficiently, see: https://chatgpt.com/c/6967a4de-d35c-8333-8a3b-8e5f8ea3329c

bib_dedupe/sim.py

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def sim_title(title_1: str, title_2: str, debug: bool = False) -> float:
126126
t1 = str(title_1)
127127
t2 = str(title_2)
128128

129-
if t1 == "" and t2 == "":
129+
if t1 in ["", "book title"] or t2 in ["", "book title"]:
130130
return 0.0
131131

132132
if t1.replace(" ", "") == t2.replace(" ", "") and t1.replace(" ", "") != "":
@@ -140,6 +140,10 @@ def sim_title(title_1: str, title_2: str, debug: bool = False) -> float:
140140
"comment",
141141
"response",
142142
"reply",
143+
"update",
144+
"forum",
145+
"proposed",
146+
"talk",
143147
]
144148
]
145149
):
@@ -314,6 +318,40 @@ def sim_volume(v1_str: str, v2_str: str) -> float:
314318
return 0.0
315319

316320

321+
# def _norm_abstract(text: str) -> str:
322+
# text = "" if text is None else str(text)
323+
324+
# # normalize common patterns
325+
# text = re.sub(r"([a-z])\s+(\d)", r"\1\2", text)
326+
# text = re.sub(r"([a-z])\s+([a-z])", r"\1\2", text)
327+
328+
# # remove punctuation (keep letters/numbers/spaces)
329+
# text = re.sub(r"[^a-z0-9\s]", " ", text)
330+
331+
# # collapse whitespace
332+
# text = re.sub(r"\s+", " ", text).strip()
333+
# return text
334+
335+
336+
# def sim_abstract(abstract_1: str, abstract_2: str) -> float:
337+
# a1 = _norm_abstract(abstract_1)
338+
# a2 = _norm_abstract(abstract_2)
339+
340+
# if not a1 or not a2:
341+
# return 0.0
342+
343+
# # If one is essentially a prefix/subsequence of the other (truncated abstract),
344+
# # partial_ratio will capture it much better than ratio.
345+
# s_ratio = fuzz.ratio(a1, a2) / 100.0
346+
# s_partial = fuzz.partial_ratio(a1, a2) / 100.0
347+
348+
# # token_set helps when words are same but order/noise differs
349+
# s_token = fuzz.token_set_ratio(a1, a2) / 100.0
350+
351+
# # take the best signal; you can also blend (see below)
352+
# return max(s_ratio, s_partial, s_token)
353+
354+
317355
def _norm_abstract(text: str) -> str:
318356
text = "" if text is None else str(text)
319357

@@ -329,22 +367,32 @@ def _norm_abstract(text: str) -> str:
329367
return text
330368

331369

332-
def sim_abstract(abstract_1: str, abstract_2: str) -> float:
333-
a1 = _norm_abstract(abstract_1)
334-
a2 = _norm_abstract(abstract_2)
370+
def sim_abstract(a1: str, a2: str) -> float:
    """Return a similarity score in [0, 1] for two abstracts.

    Inputs are expected to be pre-normalized strings (normalization was moved
    out of this function). Empty inputs score 0.0. Cheap exact/prefix checks
    short-circuit before the more expensive fuzzy comparisons.
    """
    if not a1 or not a2:
        return 0.0

    # Fast path 1: exact match.
    if a1 == a2:
        return 1.0

    # Fast path 2: truncated/prefix abstracts. Only attempted for long
    # abstracts (> 500 chars); the last `tail` characters are ignored because
    # truncation artifacts (ellipses, cut-off words) typically occur there.
    # NOTE(review): the original also re-checked len(...) > tail here, which is
    # always true once both lengths exceed 500 — the redundant guard is dropped.
    tail = 100
    if len(a1) > 500 and len(a2) > 500:
        if a1.startswith(a2[:-tail]) or a2.startswith(a1[:-tail]):
            return 1.0

    # Fuzzy comparisons, short-circuiting on perfect scores:
    # - ratio: whole-string edit similarity
    # - partial_ratio: best-matching substring (captures truncated abstracts)
    # - token_set_ratio: same words, different order/noise
    s_ratio = fuzz.ratio(a1, a2) / 100.0
    if s_ratio == 1.0:
        return 1.0
    s_partial = fuzz.partial_ratio(a1, a2) / 100.0
    if s_partial == 1.0:
        return 1.0
    s_token = fuzz.token_set_ratio(a1, a2) / 100.0

    # Take the best of the three signals.
    return max(s_ratio, s_partial, s_token)
349397

350398

0 commit comments

Comments
 (0)