@@ -82,6 +82,91 @@ def _normalize_supplement(token: str) -> str:
8282 return t .strip ()
8383
8484
85+ def _norm_loose (text : str ) -> str :
86+ """Lowercase + remove all non-alphanumerics (incl. spaces) for loose comparison."""
87+ if not text :
88+ return ""
89+ return re .sub (r"[^a-z0-9]+" , "" , text .lower ())
90+
91+
92+ def _looks_like_journal_only_title (title : str , journal : str ) -> bool :
93+ """
94+ True if title is essentially the journal name (maybe repeated),
95+ optionally with Volume/Issue/Paper numbers.
96+ """
97+ if not title or not journal :
98+ return False
99+
100+ t = title .strip ()
101+ j = journal .strip ()
102+ if not t or not j :
103+ return False
104+
105+ # fast path: exact-ish match after loose normalization
106+ j_norm = _norm_loose (j )
107+ if not j_norm :
108+ return False
109+
110+ # Strip common trailing "metadata-like" suffixes from title first
111+ # e.g., "..., Volume 52 Paper 45", "Vol 52 No 1", "(52) 45", "52 Paper 45"
112+ t_wo_meta = re .sub (
113+ r"""(?ix)
114+ (?:\bvolume\b|\bvol\.?\b|\bissue\b|\bno\.?\b|\bnumber\b|\bpaper\b|\bart\.?\b)?
115+ [\s:,\-]*\(?\s*\d+\s*\)? # a number, optionally parenthesized
116+ (?:[\s:,\-]*(?:\bpaper\b|\bart\.?\b)?[\s:,\-]*\d+)? # optional "paper 45"
117+ (?:[\s:,\-]*\(?\s*\d+\s*\)?)? # optional extra number group
118+ \s*$
119+ """ ,
120+ "" ,
121+ t ,
122+ ).strip ()
123+
124+ # Remove obvious duplicate journal repetitions inside the title
125+ # by collapsing repeated occurrences of the journal string (case-insensitive).
126+ # We'll do this loosely by repeatedly removing the journal token sequence.
127+ base = t_wo_meta
128+ # If journal is very short, avoid aggressive stripping
129+ if len (j_norm ) < 8 :
130+ return False
131+
132+ # Build a tolerant regex for the journal words (allow variable spaces/punct)
133+ # Example: "Communications of the Association for Information Systems"
134+ journal_words = [w for w in re .split (r"\s+" , j ) if w ]
135+ if not journal_words :
136+ return False
137+ journal_pat = r"(?i)" + r"[\W_]*" .join (map (re .escape , journal_words ))
138+
139+ # Remove one-or-more occurrences of the journal phrase from the title
140+ # Build a tolerant regex for the journal words (allow variable spaces/punct)
141+ journal_words = [w for w in re .split (r"\s+" , j ) if w ]
142+ journal_pat = r"[\W_]*" .join (map (re .escape , journal_words ))
143+
144+ # Remove one-or-more occurrences of the journal phrase from the title
145+ stripped = re .sub (rf"(?:{ journal_pat } )+" , "" , base , flags = re .IGNORECASE ).strip ()
146+
147+ # After stripping journal phrase(s) and trailing meta, title should be empty
148+ # (or just punctuation/numbers)
149+ stripped_norm = re .sub (r"[^a-z0-9]+" , "" , stripped .lower ())
150+
151+ # Allow remaining digits only (e.g., "52paper45" already removed, but be safe)
152+ if stripped_norm == "" :
153+ return True
154+ if stripped_norm .isdigit ():
155+ return True
156+
157+ # Also accept if what's left is only "volume"/"paper"/"issue" tokens (rare)
158+ if re .fullmatch (
159+ r"(?i)\W*(volume|vol|issue|no|number|paper|art|article)\W*" , stripped
160+ ):
161+ return True
162+
163+ # Finally: if the meta-stripped title is basically the journal name repeated
164+ if _norm_loose (base ) == j_norm or _norm_loose (base ) == (j_norm * 2 ):
165+ return True
166+
167+ return False
168+
169+
85170def fix_schema_misalignments (split_df : pd .DataFrame ) -> None :
86171 """
87172 Fix common schema misalignments where volume/number/pages contain mixed content.
@@ -213,4 +298,20 @@ def s(col: str) -> pd.Series:
213298 col ,
214299 ] = ""
215300
301+ # 8) Remove titles that are effectively just the journal name (possibly repeated)
302+ mask_drop_title = split_df .apply (
303+ lambda r : _looks_like_journal_only_title (
304+ str (r .get ("title" , "" )).strip (),
305+ str (r .get ("journal" , "" )).strip (),
306+ ),
307+ axis = 1 ,
308+ )
309+
310+ if mask_drop_title .any ():
311+ split_df .loc [mask_drop_title , "title" ] = ""
312+
313+ # final cleanup
314+ split_df ["title" ] = s ("title" )
315+ split_df ["journal" ] = s ("journal" )
316+
216317 return
0 commit comments