Skip to content

Commit ea009a9

Browse files
author
Gerit Wagner
committed
add test case and code
1 parent 220fdd3 commit ea009a9

File tree

4 files changed

+423
-0
lines changed

4 files changed

+423
-0
lines changed

bib_dedupe/prep.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,12 @@
3737
from bib_dedupe.prep_doi import prep_doi
3838
from bib_dedupe.prep_number import prep_number
3939
from bib_dedupe.prep_pages import prep_pages
40+
from bib_dedupe.prep_schema import fix_schema_misalignments
4041
from bib_dedupe.prep_title import prep_title
4142
from bib_dedupe.prep_volume import prep_volume
4243
from bib_dedupe.prep_year import prep_year
4344

45+
4446
pd.set_option("future.no_silent_downcasting", True)
4547

4648
REQUIRED_FIELDS = [ID, ENTRYTYPE, TITLE, AUTHOR, YEAR]
@@ -97,6 +99,8 @@ def prepare_df_split(split_df: pd.DataFrame) -> pd.DataFrame:
9799

98100
split_df["author_full"] = split_df[AUTHOR]
99101

102+
fix_schema_misalignments(split_df)
103+
100104
for field, function in function_mapping.items():
101105
split_df[field] = function(split_df[field].values) # type: ignore
102106

bib_dedupe/prep_schema.py

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
#! /usr/bin/env python
2+
"""Preparation of misaligned schemata"""
3+
import re
4+
5+
import pandas as pd
6+
7+
8+
# Lower-cased month names and abbreviations used to recognise month-like
# issue labels (consumed by _is_monthish).
_MONTH_TOKENS = set(
    "jan january feb february mar march apr april may jun june "
    "jul july aug august sep sept september oct october nov november "
    "dec december".split()
)
34+
35+
36+
def _strip_no_pagination(text: str) -> str:
    """Drop every '(no pagination)' fragment and collapse whitespace.

    Returns "" for falsy input. Only removes the marker text — it never
    writes a pages value.
    """
    if not text:
        return ""
    # Case-insensitive removal with flexible inner whitespace,
    # e.g. "( No  Pagination )".
    without_marker = re.sub(
        r"\(\s*no\s+pagination\s*\)", "", text, flags=re.IGNORECASE
    )
    # Collapse runs of whitespace left behind by the removal.
    collapsed = re.sub(r"\s+", " ", without_marker)
    return collapsed.strip()
43+
44+
45+
def _is_monthish(token: str) -> bool:
    """Return True if token looks like a month/season/date label (to be ignored).

    A token is "monthish" when, after lower-casing and stripping punctuation,
    any whitespace-separated part is a month name/abbreviation from
    _MONTH_TOKENS — e.g. "JUL", "FEBRUARY 2012", "(7 JUL)" (parens removed
    by the caller), "3 aug".
    """
    if not token:
        return False
    t = token.strip().lower()
    # Replace punctuation with spaces so month detection sees bare words.
    t_clean = re.sub(r"[^a-z0-9 ]", " ", t)
    parts = [p for p in t_clean.split() if p]
    # any() over all parts also covers "<digit> <month>" pairs such as
    # "3 aug"; the former separate two-part check was unreachable because
    # this condition had already returned True for it.  any([]) is False,
    # so an all-punctuation token is correctly not monthish.
    return any(p in _MONTH_TOKENS for p in parts)
62+
63+
64+
def _normalize_supplement(token: str) -> str:
    """Normalize common supplement formats lightly (keep informative text).

    Rewrites, case-insensitively:
    - "SUPPL. 1" / "SUPPL 1"   -> "SUPPL.1"
    - "Supplement3"            -> "Supplement 3"
    - "SPEC.ISS 1" / "SpecIss1"-> "Spec.Iss 1"
    Any other token is returned whitespace-normalized but otherwise unchanged.
    """
    if not token:
        return ""

    t = token.strip()
    t = re.sub(r"\s+", " ", t)

    # SUPPL. 1 -> SUPPL.1 ; SUPPL.1 -> SUPPL.1
    t = re.sub(r"(?i)\bSUPPL\.?\s*(\d+)\b", r"SUPPL.\1", t)

    # "Supplement3" -> "Supplement 3".  The \s* also matches zero
    # whitespace, so the former second no-whitespace pattern
    # (\bSupplement([0-9]+)\b) was unreachable and has been removed.
    t = re.sub(r"(?i)\bSupplement\s*([0-9]+)\b", r"Supplement \1", t)

    # "SPEC.ISS 1" / "Spec.Iss 1" -> "Spec.Iss 1"
    t = re.sub(r"(?i)\bSPEC\.?\s*ISS\.?\s*(\d+)\b", r"Spec.Iss \1", t)

    return t.strip()
83+
84+
85+
def fix_schema_misalignments(split_df: pd.DataFrame) -> None:
    """
    Fix common schema misalignments where volume/number/pages contain mixed content.

    Updated rules (per request):
    - '(no pagination)' is removed wherever it appears, but pages MUST NOT be set.
    - Month-like tokens (JAN, FEBRUARY 2012, '(7 JUL)', etc.) are removed/ignored.
    - "Strange large issue" values are not treated specially (left as-is if parsed).
    - Function mutates split_df in-place and returns None.

    NOTE(review): boolean masks and .loc writes align by index label, so this
    assumes split_df has a unique index — confirm against the caller.
    The number/pages/year snapshots (num/pag/yr) are refreshed between steps;
    the statement order is significant.
    """
    # Nothing to do for an empty frame; also avoids .str accessor edge cases.
    if split_df.empty:
        return

    # ensure columns exist
    for col in ("volume", "number", "pages", "year"):
        if col not in split_df.columns:
            split_df[col] = ""

    # helper to get safe string series
    # (fillna before astype so NaN does not become the string "nan")
    def s(col: str) -> pd.Series:
        return split_df[col].fillna("").astype(str).str.strip()

    # 1) strip '(no pagination)' everywhere (volume/number/pages)
    split_df["volume"] = s("volume").map(_strip_no_pagination)
    split_df["number"] = s("number").map(_strip_no_pagination)
    split_df["pages"] = s("pages").map(_strip_no_pagination)

    # Snapshots used to build masks below; refreshed after mutating steps.
    num = s("number")
    pag = s("pages")
    yr = s("year")

    # 2) If pages is like "(1)" or "(4)" -> move into number if empty, clear pages
    # Also handle "(1) (no pagination)" already stripped to "(1)" above.
    # extract() yields NaN for non-matching rows, hence notna() as the mask.
    m_pages_issue = pag.str.extract(r"^\(\s*(?P<iss>[^)]+?)\s*\)$")
    mask_pages_issue = m_pages_issue["iss"].notna()
    if mask_pages_issue.any():
        issue_val = m_pages_issue["iss"].fillna("").astype(str).str.strip()
        # ignore monthish issue labels
        mask_set = mask_pages_issue & (num == "") & (~issue_val.map(_is_monthish))
        split_df.loc[mask_set, "number"] = issue_val[mask_set].map(
            _normalize_supplement
        )
        # Pages are cleared for ALL "(...)"-only values, monthish included.
        split_df.loc[mask_pages_issue, "pages"] = ""  # clear pages (don't set to 1)
        # refresh
        num = s("number")
        pag = s("pages")

    # 3) Volume-only "(4)" -> issue without volume: set number if empty; clear volume
    # Also handle "(7 JUL)" monthish -> drop
    vol_now = s("volume")
    m_only_paren = vol_now.str.extract(r"^\(\s*(?P<tok>[^)]+?)\s*\)$")
    mask_only_paren = m_only_paren["tok"].notna()
    if mask_only_paren.any():
        tok = m_only_paren["tok"].fillna("").astype(str).str.strip()
        mask_set = mask_only_paren & (num == "") & (~tok.map(_is_monthish))
        split_df.loc[mask_set, "number"] = tok[mask_set].map(_normalize_supplement)
        # always clear volume if it was only "(...)" (monthish or not)
        split_df.loc[mask_only_paren, "volume"] = ""
        num = s("number")

    # 4) Year stored where volume should be: "2017 (10)" or "2017"
    # If year field empty, copy year. If parentheses after year look like issue, move to number.
    vol_now = s("volume")
    m_year = vol_now.str.extract(r"^(?P<year>\d{4})(?:\s*\(\s*(?P<iss>[^)]+?)\s*\))?$")
    mask_year = m_year["year"].notna()
    if mask_year.any():
        yval = m_year["year"].fillna("").astype(str).str.strip()
        iss = m_year["iss"].fillna("").astype(str).str.strip()

        # set year if empty (never overwrite an existing year value)
        mask_set_year = mask_year & (yr == "") & (yval != "")
        split_df.loc[mask_set_year, "year"] = yval[mask_set_year]

        # set number from iss if number empty and iss exists and not monthish
        mask_set_num = mask_year & (num == "") & (iss != "") & (~iss.map(_is_monthish))
        split_df.loc[mask_set_num, "number"] = iss[mask_set_num].map(
            _normalize_supplement
        )

        # clear volume (because it was a year)
        split_df.loc[mask_year, "volume"] = ""

        num = s("number")
        yr = s("year")

    # 5) Main pattern: "V (X)" where X may include nested parentheses like "2(2)"
    vol_now = s("volume")

    # OLD (breaks on nested parens):
    # m_vol_issue = vol_now.str.extract(r"^(?P<vol>[A-Za-z0-9]+)\s*\(\s*(?P<iss>[^)]+?)\s*\)$")

    # NEW (captures everything up to the last ')'):
    # greedy .+ keeps inner parentheses inside the iss group.
    m_vol_issue = vol_now.str.extract(
        r"^(?P<vol>[A-Za-z0-9]+)\s*\(\s*(?P<iss>.+)\s*\)$"
    )

    mask_vol_issue = m_vol_issue["vol"].notna()
    if mask_vol_issue.any():
        v = m_vol_issue["vol"].fillna("").astype(str).str.strip()
        iss = m_vol_issue["iss"].fillna("").astype(str).str.strip()

        # always set volume to the leading part
        split_df.loc[mask_vol_issue, "volume"] = v[mask_vol_issue]

        # set number only if empty and issue isn't monthish
        mask_set_num = (
            mask_vol_issue & (num == "") & (iss != "") & (~iss.map(_is_monthish))
        )
        split_df.loc[mask_set_num, "number"] = iss[mask_set_num].map(
            _normalize_supplement
        )

        num = s("number")

    # 6) If any residual "(no pagination)" became empty-only markers, ensure cleaned
    # (re-normalizes whitespace on all three columns)
    split_df["volume"] = s("volume")
    split_df["number"] = s("number")
    split_df["pages"] = s("pages")

    # 7) If a field is now literally "no pagination" (without parens), drop it too (rare)
    for col in ("volume", "number", "pages"):
        split_df.loc[
            split_df[col]
            .fillna("")
            .astype(str)
            .str.strip()
            .str.lower()
            .eq("no pagination"),
            col,
        ] = ""

    return

0 commit comments

Comments
 (0)