# Journal names whose loose form is shorter than this are too generic to be
# matched safely against a title.
_MIN_JOURNAL_NORM_LEN = 8

# Loose-normalized leftovers that carry no title information: any sequence of
# digits and volume/issue/paper designations. A single ``\d`` (not ``\d+``)
# is used inside the repetition to avoid nested-quantifier backtracking.
_JOURNAL_TITLE_META_RE = re.compile(
    r"(?:\d|volume|vol|issue|number|no|paper|article|art)*"
)


def _norm_loose(text: str) -> str:
    """Lowercase *text* and drop every non-alphanumeric character.

    Whitespace, punctuation and underscores are removed; letters and digits
    of any script are kept, so non-Latin text is not silently reduced to an
    empty string. Falsy input yields ``""``.
    """
    if not text:
        return ""
    return re.sub(r"[\W_]+", "", text.lower())


def _looks_like_journal_only_title(title: str, journal: str) -> bool:
    """Return True if *title* is just the journal name plus volume metadata.

    The comparison is done on the loose form of both strings (see
    ``_norm_loose``), which makes it insensitive to case, spacing and
    punctuation. The title qualifies when

    * the journal name occurs in it at least once (it may be repeated), and
    * everything left after removing those occurrences consists only of
      digits and the designations volume/vol/issue/number/no/paper/
      article/art, e.g. ``"Volume 52 Paper 45"`` or ``"Vol. 52 Issue 1"``.

    Args:
        title: Title field of the record.
        journal: Journal field of the same record.

    Returns:
        ``True`` when the title carries no information beyond the journal
        name; ``False`` otherwise, including when either value is empty or
        the journal name is too short (fewer than 8 alphanumerics) to be
        matched safely.
    """
    journal_norm = _norm_loose(journal)
    # Very short journal names (acronyms etc.) would match far too eagerly.
    if len(journal_norm) < _MIN_JOURNAL_NORM_LEN:
        return False

    title_norm = _norm_loose(title)
    # The journal must actually be present: a title made only of digits,
    # punctuation or a lone word such as "Paper" is not a journal-only title.
    if journal_norm not in title_norm:
        return False

    # Remove the journal first so that journal names ending in a number
    # ("... 2.0") are not damaged by the metadata check below.
    leftover = title_norm.replace(journal_norm, "")
    return _JOURNAL_TITLE_META_RE.fullmatch(leftover) is not None
@@ -213,4 +298,20 @@ def s(col: str) -> pd.Series: col, ] = "" + # 8) Remove titles that are effectively just the journal name (possibly repeated) + mask_drop_title = split_df.apply( + lambda r: _looks_like_journal_only_title( + str(r.get("title", "")).strip(), + str(r.get("journal", "")).strip(), + ), + axis=1, + ) + + if mask_drop_title.any(): + split_df.loc[mask_drop_title, "title"] = "" + + # final cleanup + split_df["title"] = s("title") + split_df["journal"] = s("journal") + return diff --git a/tests/test_cases.json b/tests/test_cases.json index 50ce82d..fb352aa 100644 --- a/tests/test_cases.json +++ b/tests/test_cases.json @@ -645,6 +645,29 @@ "doi": "10.1186/s13756-014-0041-4" }, "expected_duplicate": true + }, + { + "id": "hu_moody_galletta_2023_title_is_journal_repeated_should_be_removed", + "note": "Title contains only repeated journal name plus 'Volume 52 Paper 41' boilerplate.", + "record_a": { + "ENTRYTYPE": "article", + "ID": "RaoMcnaughtonVermaUNKNOWN", + "author": "Rao, Lila and Mcnaughton, Maurice and Verma, Sameer", + "year": "2023", + "journal": "Communications of the Association for Information Systems", + "title": "Communications of the Association for Information Systems Communications of the Association for Information Systems Volume 52 Paper 45", + "volume": "52" + }, + "record_b": { + "ENTRYTYPE": "article", + "ID": "HuMoodyGalletta2023", + "author": "Hu, Han-Fen and Moody, Gregory D and Galletta, Dennis F", + "year": "2023", + "journal": "Communications of the Association for Information Systems", + "title": "Communications of the Association for Information Systems Communications of the Association for Information Systems Volume 52 Paper 41", + "volume": "52" + }, + "expected_duplicate": false } ] From 2fb2ff1ce3b0481852e3f8b5cd8d0d51b1785e42 Mon Sep 17 00:00:00 2001 From: Gerit Wagner Date: Mon, 26 Jan 2026 12:02:20 +0100 Subject: [PATCH 2/2] fix --- bib_dedupe/prep_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/bib_dedupe/prep_schema.py b/bib_dedupe/prep_schema.py index 1cf977c..cb05b26 100644 --- a/bib_dedupe/prep_schema.py +++ b/bib_dedupe/prep_schema.py @@ -181,7 +181,7 @@ def fix_schema_misalignments(split_df: pd.DataFrame) -> None: return # ensure columns exist - for col in ("volume", "number", "pages", "year"): + for col in ("title", "journal", "volume", "number", "pages", "year"): if col not in split_df.columns: split_df[col] = ""