code and test case

Gerit Wagner · Gerit Wagner · commit be1dd424c1ce · 2026-01-26T11:59:39.000+01:00
diff --git a/bib_dedupe/prep_schema.py b/bib_dedupe/prep_schema.py
@@ -82,6 +82,91 @@ def _normalize_supplement(token: str) -> str:
     return t.strip()
 
 
+def _norm_loose(text: str) -> str:
+    """
+    Build a loose comparison key: casefold *text* and keep only letters/digits.
+
+    Whitespace, punctuation and underscores are dropped. The character class
+    is Unicode-aware on purpose: an ASCII-only class would delete accented and
+    non-Latin letters, so unrelated non-Latin strings would all collapse to the
+    same (empty) key.
+
+    Args:
+        text: String to normalize; any falsy value yields "".
+
+    Returns:
+        The casefolded string with every non-alphanumeric character removed.
+    """
+    if not text:
+        return ""
+    # [\W_] = everything that is not a letter or digit (\w minus underscore).
+    return re.sub(r"[\W_]+", "", text.casefold())
+
+
+def _looks_like_journal_only_title(title: str, journal: str) -> bool:
+    """
+    Return True if *title* is just the journal name (possibly repeated),
+    optionally mixed with volume/issue/paper boilerplate and numbers.
+
+    Both strings are compared through their ``_norm_loose`` keys, so case,
+    spacing and punctuation differences are ignored. The journal key must
+    actually occur in the title key: an empty or keyword-only leftover alone
+    is not enough, otherwise titles such as "Art", "Volume 52", "2020" or text
+    that normalizes to an empty key would be reported as journal-only.
+
+    Args:
+        title: Candidate title string.
+        journal: Journal name the title is compared against.
+
+    Returns:
+        True when the title key consists only of the journal key (one or more
+        times) plus boilerplate keywords (volume, vol, issue, number, no,
+        paper, article, art) and digits. False otherwise, and always False
+        when either input is blank or the journal key has fewer than 8
+        characters.
+    """
+    if not title or not journal:
+        return False
+
+    # Short journal keys (e.g. "science") would be found inside ordinary
+    # titles far too easily, so they are never treated as journal-only.
+    j_key = _norm_loose(journal)
+    if len(j_key) < 8:
+        return False
+
+    t_key = _norm_loose(title)
+    if j_key not in t_key:
+        return False
+
+    # Drop every occurrence of the journal; what is left must be nothing but
+    # metadata, e.g. "Volume 52 Paper 45" -> "volume52paper45",
+    # "Vol. 52 No. 1" -> "vol52no1", "(52) 45" -> "5245".
+    leftover = t_key.replace(j_key, "")
+
+    # A single-digit alternative (\d, not \d+) inside the repeated group avoids
+    # nested quantifiers and keeps matching linear on long digit runs.
+    return (
+        re.fullmatch(
+            r"(?:\d|volume|vol|issue|number|no|paper|article|art)*",
+            leftover,
+        )
+        is not None
+    )
+
+
 def fix_schema_misalignments(split_df: pd.DataFrame) -> None:
     """
     Fix common schema misalignments where volume/number/pages contain mixed content.
@@ -213,4 +298,20 @@ def s(col: str) -> pd.Series:
             col,
         ] = ""
 
+    # 8) Remove titles that are effectively just the journal name (possibly repeated)
+    mask_drop_title = split_df.apply(
+        lambda r: _looks_like_journal_only_title(
+            str(r.get("title", "")).strip(),
+            str(r.get("journal", "")).strip(),
+        ),
+        axis=1,
+    )
+
+    if mask_drop_title.any():
+        split_df.loc[mask_drop_title, "title"] = ""
+
+    # final cleanup
+    split_df["title"] = s("title")
+    split_df["journal"] = s("journal")
+
     return
diff --git a/tests/test_cases.json b/tests/test_cases.json
@@ -645,6 +645,29 @@
         "doi": "10.1186/s13756-014-0041-4"
       },
       "expected_duplicate": true
+    },
+    {
+      "id": "hu_moody_galletta_2023_title_is_journal_repeated_should_be_removed",
+      "note": "Title contains only repeated journal name plus 'Volume 52 Paper 41' boilerplate.",
+      "record_a": {
+        "ENTRYTYPE": "article",
+        "ID": "RaoMcnaughtonVermaUNKNOWN",
+        "author": "Rao, Lila and Mcnaughton, Maurice and Verma, Sameer",
+        "year": "2023",
+        "journal": "Communications of the Association for Information Systems",
+        "title": "Communications of the Association for Information Systems Communications of the Association for Information Systems Volume 52 Paper 45",
+        "volume": "52"
+      },
+      "record_b": {
+        "ENTRYTYPE": "article",
+        "ID": "HuMoodyGalletta2023",
+        "author": "Hu, Han-Fen and Moody, Gregory D and Galletta, Dennis F",
+        "year": "2023",
+        "journal": "Communications of the Association for Information Systems",
+        "title": "Communications of the Association for Information Systems Communications of the Association for Information Systems Volume 52 Paper 41",
+        "volume": "52"
+      },
+      "expected_duplicate": false
     }
 
   ]