Skip to content

Commit be1dd42

Browse files
author
Gerit Wagner
committed
code and test case
1 parent 1e2e168 commit be1dd42

File tree

2 files changed

+124
-0
lines changed

2 files changed

+124
-0
lines changed

bib_dedupe/prep_schema.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,91 @@ def _normalize_supplement(token: str) -> str:
8282
return t.strip()
8383

8484

85+
def _norm_loose(text: str) -> str:
86+
"""Lowercase + remove all non-alphanumerics (incl. spaces) for loose comparison."""
87+
if not text:
88+
return ""
89+
return re.sub(r"[^a-z0-9]+", "", text.lower())
90+
91+
92+
# Trailing volume/issue/paper numbering such as "Volume 52 Paper 45", "No 1",
# "(52) 45" or "52 Paper 45". A single substitution removes one trailing run
# (keyword + number, optionally followed by a paper number and one more number).
_TRAILING_META_RE = re.compile(
    r"""(?ix)
    (?:\bvolume\b|\bvol\.?\b|\bissue\b|\bno\.?\b|\bnumber\b|\bpaper\b|\bart\.?\b)?
    [\s:,\-]*\(?\s*\d+\s*\)?                             # a number, optionally parenthesized
    (?:[\s:,\-]*(?:\bpaper\b|\bart\.?\b)?[\s:,\-]*\d+)?  # optional "paper 45"
    (?:[\s:,\-]*\(?\s*\d+\s*\)?)?                        # optional extra number group
    \s*$
    """
)

# A lone metadata keyword surrounded by nothing but punctuation/whitespace.
_META_TOKEN_ONLY_RE = re.compile(
    r"(?i)\W*(volume|vol|issue|no|number|paper|art|article)\W*"
)


def _looks_like_journal_only_title(title: str, journal: str) -> bool:
    """Return True if *title* is essentially just the journal name.

    A title counts as "journal only" when, after removing every
    case-insensitive occurrence of the journal name (its words may be
    separated by arbitrary punctuation/whitespace), nothing but punctuation,
    digits or a single volume/issue/paper keyword remains, or when its loose
    normalization equals the journal name written once or twice.

    The check is applied to the title as given and then again each time one
    trailing numbering group (e.g. "Paper 45", "No 1") has been peeled off.
    Checking the unstripped title first keeps journals whose own name ends in
    a number detectable; repeating the peel handles titles with several
    trailing groups such as "... Vol 52 No 1".

    Args:
        title: Title field of the record.
        journal: Journal field of the record.

    Returns:
        True if the title carries nothing beyond the journal name and
        volume/issue/paper numbering (a title consisting solely of such
        numbering also qualifies). False otherwise, and always False when
        either field is blank or the loosely normalized journal name is
        shorter than 8 characters.
    """
    if not title or not journal:
        return False

    t = title.strip()
    j = journal.strip()
    if not t or not j:
        return False

    # Short journal names (acronyms etc.) would match far too eagerly, so
    # bail out before doing any regex work. This also covers journal strings
    # that normalize to nothing at all.
    j_norm = _norm_loose(j)
    if len(j_norm) < 8:
        return False

    # Tolerant pattern for the journal phrase: the words must appear in order
    # but may be separated by any run of punctuation/whitespace/underscores.
    # Case-insensitivity is passed as a flag rather than an inline "(?i)",
    # which Python 3.11+ rejects when it is not at the start of the pattern.
    journal_re = re.compile(
        r"(?:" + r"[\W_]*".join(map(re.escape, j.split())) + r")+",
        flags=re.IGNORECASE,
    )

    candidate = t
    while True:
        # What is left of the candidate once all journal phrases are gone?
        remainder = journal_re.sub("", candidate).strip()
        remainder_norm = _norm_loose(remainder)
        if (
            not remainder_norm  # nothing (or only punctuation) left
            or remainder_norm.isdigit()  # only numbers left
            or _META_TOKEN_ONLY_RE.fullmatch(remainder)  # lone "volume"/"paper"/...
            # Journal written once or twice with different punctuation than
            # the journal field, which the word-by-word pattern cannot match.
            or _norm_loose(candidate) in (j_norm, j_norm * 2)
        ):
            return True

        # Peel one trailing numbering group and try again; every successful
        # substitution removes at least one digit, so the loop terminates.
        shorter = _TRAILING_META_RE.sub("", candidate).strip()
        if shorter == candidate:
            return False
        candidate = shorter
168+
169+
85170
def fix_schema_misalignments(split_df: pd.DataFrame) -> None:
86171
"""
87172
Fix common schema misalignments where volume/number/pages contain mixed content.
@@ -213,4 +298,20 @@ def s(col: str) -> pd.Series:
213298
col,
214299
] = ""
215300

301+
# 8) Remove titles that are effectively just the journal name (possibly repeated)
302+
mask_drop_title = split_df.apply(
303+
lambda r: _looks_like_journal_only_title(
304+
str(r.get("title", "")).strip(),
305+
str(r.get("journal", "")).strip(),
306+
),
307+
axis=1,
308+
)
309+
310+
if mask_drop_title.any():
311+
split_df.loc[mask_drop_title, "title"] = ""
312+
313+
# final cleanup
314+
split_df["title"] = s("title")
315+
split_df["journal"] = s("journal")
316+
216317
return

tests/test_cases.json

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,29 @@
645645
"doi": "10.1186/s13756-014-0041-4"
646646
},
647647
"expected_duplicate": true
648+
},
649+
{
650+
"id": "hu_moody_galletta_2023_title_is_journal_repeated_should_be_removed",
651+
"note": "Both titles contain only the repeated journal name plus 'Volume 52 Paper NN' boilerplate (Paper 45 vs. Paper 41).",
652+
"record_a": {
653+
"ENTRYTYPE": "article",
654+
"ID": "RaoMcnaughtonVermaUNKNOWN",
655+
"author": "Rao, Lila and Mcnaughton, Maurice and Verma, Sameer",
656+
"year": "2023",
657+
"journal": "Communications of the Association for Information Systems",
658+
"title": "Communications of the Association for Information Systems Communications of the Association for Information Systems Volume 52 Paper 45",
659+
"volume": "52"
660+
},
661+
"record_b": {
662+
"ENTRYTYPE": "article",
663+
"ID": "HuMoodyGalletta2023",
664+
"author": "Hu, Han-Fen and Moody, Gregory D and Galletta, Dennis F",
665+
"year": "2023",
666+
"journal": "Communications of the Association for Information Systems",
667+
"title": "Communications of the Association for Information Systems Communications of the Association for Information Systems Volume 52 Paper 41",
668+
"volume": "52"
669+
},
670+
"expected_duplicate": false
648671
}
649672

650673
]

0 commit comments

Comments
 (0)