-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathprep_schema.py
More file actions
317 lines (260 loc) · 10.8 KB
/
prep_schema.py
File metadata and controls
317 lines (260 loc) · 10.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
#! /usr/bin/env python
"""Preparation of misaligned schemata"""
import re
import pandas as pd
_MONTH_TOKENS = {
"jan",
"january",
"feb",
"february",
"mar",
"march",
"apr",
"april",
"may",
"jun",
"june",
"jul",
"july",
"aug",
"august",
"sep",
"sept",
"september",
"oct",
"october",
"nov",
"november",
"dec",
"december",
}
def _strip_no_pagination(text: str) -> str:
"""Remove '(no pagination)' fragments without setting pages."""
if not text:
return ""
# remove any occurrence like "(no pagination)" with flexible whitespace/case
text = re.sub(r"\(\s*no\s+pagination\s*\)", "", text, flags=re.IGNORECASE)
return re.sub(r"\s+", " ", text).strip()
def _is_monthish(token: str) -> bool:
"""Return True if token looks like a month/season/date label (to be ignored)."""
if not token:
return False
t = token.strip().lower()
# remove punctuation for month detection
t_clean = re.sub(r"[^a-z0-9 ]", " ", t)
parts = [p for p in t_clean.split() if p]
if not parts:
return False
# if any part is a month token -> treat as monthish
if any(p in _MONTH_TOKENS for p in parts):
return True
# patterns like "3 aug" (month abbrev without parentheses)
if len(parts) == 2 and parts[1] in _MONTH_TOKENS and parts[0].isdigit():
return True
return False
def _normalize_supplement(token: str) -> str:
"""Normalize common supplement formats lightly (keep informative text)."""
if not token:
return ""
t = token.strip()
t = re.sub(r"\s+", " ", t)
# SUPPL. 1 -> SUPPL.1 ; SUPPL.1 -> SUPPL.1
t = re.sub(r"(?i)\bSUPPL\.?\s*(\d+)\b", r"SUPPL.\1", t)
# "Supplement3" -> "Supplement 3"
t = re.sub(r"(?i)\bSupplement\s*([0-9]+)\b", r"Supplement \1", t)
t = re.sub(r"(?i)\bSupplement([0-9]+)\b", r"Supplement \1", t)
# "SPEC.ISS 1" / "Spec.Iss 1" -> "Spec.Iss 1"
t = re.sub(r"(?i)\bSPEC\.?\s*ISS\.?\s*(\d+)\b", r"Spec.Iss \1", t)
return t.strip()
def _norm_loose(text: str) -> str:
"""Lowercase + remove all non-alphanumerics (incl. spaces) for loose comparison."""
if not text:
return ""
return re.sub(r"[^a-z0-9]+", "", text.lower())
def _looks_like_journal_only_title(title: str, journal: str) -> bool:
"""
True if title is essentially the journal name (maybe repeated),
optionally with Volume/Issue/Paper numbers.
"""
if not title or not journal:
return False
t = title.strip()
j = journal.strip()
if not t or not j:
return False
# fast path: exact-ish match after loose normalization
j_norm = _norm_loose(j)
if not j_norm:
return False
# Strip common trailing "metadata-like" suffixes from title first
# e.g., "..., Volume 52 Paper 45", "Vol 52 No 1", "(52) 45", "52 Paper 45"
t_wo_meta = re.sub(
r"""(?ix)
(?:\bvolume\b|\bvol\.?\b|\bissue\b|\bno\.?\b|\bnumber\b|\bpaper\b|\bart\.?\b)?
[\s:,\-]*\(?\s*\d+\s*\)? # a number, optionally parenthesized
(?:[\s:,\-]*(?:\bpaper\b|\bart\.?\b)?[\s:,\-]*\d+)? # optional "paper 45"
(?:[\s:,\-]*\(?\s*\d+\s*\)?)? # optional extra number group
\s*$
""",
"",
t,
).strip()
# Remove obvious duplicate journal repetitions inside the title
# by collapsing repeated occurrences of the journal string (case-insensitive).
# We'll do this loosely by repeatedly removing the journal token sequence.
base = t_wo_meta
# If journal is very short, avoid aggressive stripping
if len(j_norm) < 8:
return False
# Build a tolerant regex for the journal words (allow variable spaces/punct)
# Example: "Communications of the Association for Information Systems"
journal_words = [w for w in re.split(r"\s+", j) if w]
if not journal_words:
return False
journal_pat = r"(?i)" + r"[\W_]*".join(map(re.escape, journal_words))
# Remove one-or-more occurrences of the journal phrase from the title
# Build a tolerant regex for the journal words (allow variable spaces/punct)
journal_words = [w for w in re.split(r"\s+", j) if w]
journal_pat = r"[\W_]*".join(map(re.escape, journal_words))
# Remove one-or-more occurrences of the journal phrase from the title
stripped = re.sub(rf"(?:{journal_pat})+", "", base, flags=re.IGNORECASE).strip()
# After stripping journal phrase(s) and trailing meta, title should be empty
# (or just punctuation/numbers)
stripped_norm = re.sub(r"[^a-z0-9]+", "", stripped.lower())
# Allow remaining digits only (e.g., "52paper45" already removed, but be safe)
if stripped_norm == "":
return True
if stripped_norm.isdigit():
return True
# Also accept if what's left is only "volume"/"paper"/"issue" tokens (rare)
if re.fullmatch(
r"(?i)\W*(volume|vol|issue|no|number|paper|art|article)\W*", stripped
):
return True
# Finally: if the meta-stripped title is basically the journal name repeated
if _norm_loose(base) == j_norm or _norm_loose(base) == (j_norm * 2):
return True
return False
def fix_schema_misalignments(split_df: pd.DataFrame) -> None:
    """
    Fix common schema misalignments where volume/number/pages contain mixed content.

    Updated rules (per request):
    - '(no pagination)' is removed wherever it appears, but pages MUST NOT be set.
    - Month-like tokens (JAN, FEBRUARY 2012, '(7 JUL)', etc.) are removed/ignored.
    - "Strange large issue" values are not treated specially (left as-is if parsed).
    - Function mutates split_df in-place and returns None.

    Processing steps (each step re-reads the affected columns so later steps
    see earlier fixes):
      1. strip '(no pagination)' from volume/number/pages
      2. pages == "(X)"            -> number (if empty), pages cleared
      3. volume == "(X)"           -> number (if empty), volume cleared
      4. volume == "YYYY (X)"      -> year (if empty) + number (if empty), volume cleared
      5. volume == "V (X)"         -> volume=V, number=X (if empty)
      6-7. residual "no pagination" markers cleared
      8. titles that are just the journal name are blanked

    NOTE(review): assumes the six columns hold string-ish scalars; all values
    are coerced via astype(str) before matching — confirm against callers.
    """
    if split_df.empty:
        return
    # ensure columns exist (missing ones are created empty so .loc writes work)
    for col in ("title", "journal", "volume", "number", "pages", "year"):
        if col not in split_df.columns:
            split_df[col] = ""

    # helper to get safe string series: NaN -> "", everything stringified, stripped
    def s(col: str) -> pd.Series:
        return split_df[col].fillna("").astype(str).str.strip()

    # 1) strip '(no pagination)' everywhere (volume/number/pages)
    split_df["volume"] = s("volume").map(_strip_no_pagination)
    split_df["number"] = s("number").map(_strip_no_pagination)
    split_df["pages"] = s("pages").map(_strip_no_pagination)
    # cached string views; refreshed after every mutating step below
    num = s("number")
    pag = s("pages")
    yr = s("year")

    # 2) If pages is like "(1)" or "(4)" -> move into number if empty, clear pages
    # Also handle "(1) (no pagination)" already stripped to "(1)" above.
    m_pages_issue = pag.str.extract(r"^\(\s*(?P<iss>[^)]+?)\s*\)$")
    mask_pages_issue = m_pages_issue["iss"].notna()
    if mask_pages_issue.any():
        issue_val = m_pages_issue["iss"].fillna("").astype(str).str.strip()
        # ignore monthish issue labels (e.g. "(7 JUL)") — never promoted to number
        mask_set = mask_pages_issue & (num == "") & (~issue_val.map(_is_monthish))
        split_df.loc[mask_set, "number"] = issue_val[mask_set].map(
            _normalize_supplement
        )
        split_df.loc[mask_pages_issue, "pages"] = ""  # clear pages (don't set to 1)
    # refresh cached views after the step-2 writes
    num = s("number")
    pag = s("pages")

    # 3) Volume-only "(4)" -> issue without volume: set number if empty; clear volume
    # Also handle "(7 JUL)" monthish -> drop
    vol_now = s("volume")
    m_only_paren = vol_now.str.extract(r"^\(\s*(?P<tok>[^)]+?)\s*\)$")
    mask_only_paren = m_only_paren["tok"].notna()
    if mask_only_paren.any():
        tok = m_only_paren["tok"].fillna("").astype(str).str.strip()
        mask_set = mask_only_paren & (num == "") & (~tok.map(_is_monthish))
        split_df.loc[mask_set, "number"] = tok[mask_set].map(_normalize_supplement)
        # always clear volume if it was only "(...)" (monthish or not)
        split_df.loc[mask_only_paren, "volume"] = ""
    num = s("number")

    # 4) Year stored where volume should be: "2017 (10)" or "2017"
    # If year field empty, copy year. If parentheses after year look like issue, move to number.
    vol_now = s("volume")
    m_year = vol_now.str.extract(r"^(?P<year>\d{4})(?:\s*\(\s*(?P<iss>[^)]+?)\s*\))?$")
    mask_year = m_year["year"].notna()
    if mask_year.any():
        yval = m_year["year"].fillna("").astype(str).str.strip()
        iss = m_year["iss"].fillna("").astype(str).str.strip()
        # set year if empty — never overwrite an existing year
        mask_set_year = mask_year & (yr == "") & (yval != "")
        split_df.loc[mask_set_year, "year"] = yval[mask_set_year]
        # set number from iss if number empty and iss exists and not monthish
        mask_set_num = mask_year & (num == "") & (iss != "") & (~iss.map(_is_monthish))
        split_df.loc[mask_set_num, "number"] = iss[mask_set_num].map(
            _normalize_supplement
        )
        # clear volume (because it was a year)
        split_df.loc[mask_year, "volume"] = ""
    num = s("number")
    yr = s("year")

    # 5) Main pattern: "V (X)" where X may include nested parentheses like "2(2)"
    vol_now = s("volume")
    # OLD (breaks on nested parens):
    # m_vol_issue = vol_now.str.extract(r"^(?P<vol>[A-Za-z0-9]+)\s*\(\s*(?P<iss>[^)]+?)\s*\)$")
    # NEW (greedy .+ captures everything up to the last ')'):
    m_vol_issue = vol_now.str.extract(
        r"^(?P<vol>[A-Za-z0-9]+)\s*\(\s*(?P<iss>.+)\s*\)$"
    )
    mask_vol_issue = m_vol_issue["vol"].notna()
    if mask_vol_issue.any():
        v = m_vol_issue["vol"].fillna("").astype(str).str.strip()
        iss = m_vol_issue["iss"].fillna("").astype(str).str.strip()
        # always set volume to the leading part
        split_df.loc[mask_vol_issue, "volume"] = v[mask_vol_issue]
        # set number only if empty and issue isn't monthish
        mask_set_num = (
            mask_vol_issue & (num == "") & (iss != "") & (~iss.map(_is_monthish))
        )
        split_df.loc[mask_set_num, "number"] = iss[mask_set_num].map(
            _normalize_supplement
        )
    num = s("number")

    # 6) If any residual "(no pagination)" became empty-only markers, ensure cleaned
    split_df["volume"] = s("volume")
    split_df["number"] = s("number")
    split_df["pages"] = s("pages")

    # 7) If a field is now literally "no pagination" (without parens), drop it too (rare)
    for col in ("volume", "number", "pages"):
        split_df.loc[
            split_df[col]
            .fillna("")
            .astype(str)
            .str.strip()
            .str.lower()
            .eq("no pagination"),
            col,
        ] = ""

    # 8) Remove titles that are effectively just the journal name (possibly repeated)
    mask_drop_title = split_df.apply(
        lambda r: _looks_like_journal_only_title(
            str(r.get("title", "")).strip(),
            str(r.get("journal", "")).strip(),
        ),
        axis=1,
    )
    if mask_drop_title.any():
        split_df.loc[mask_drop_title, "title"] = ""

    # final cleanup: normalize whitespace on the text columns
    split_df["title"] = s("title")
    split_df["journal"] = s("journal")
    return