#! /usr/bin/env python
"""Preparation of misaligned schemata."""
import re

import pandas as pd
| 6 | + |
| 7 | + |
| 8 | +_MONTH_TOKENS = { |
| 9 | + "jan", |
| 10 | + "january", |
| 11 | + "feb", |
| 12 | + "february", |
| 13 | + "mar", |
| 14 | + "march", |
| 15 | + "apr", |
| 16 | + "april", |
| 17 | + "may", |
| 18 | + "jun", |
| 19 | + "june", |
| 20 | + "jul", |
| 21 | + "july", |
| 22 | + "aug", |
| 23 | + "august", |
| 24 | + "sep", |
| 25 | + "sept", |
| 26 | + "september", |
| 27 | + "oct", |
| 28 | + "october", |
| 29 | + "nov", |
| 30 | + "november", |
| 31 | + "dec", |
| 32 | + "december", |
| 33 | +} |
| 34 | + |
| 35 | + |
| 36 | +def _strip_no_pagination(text: str) -> str: |
| 37 | + """Remove '(no pagination)' fragments without setting pages.""" |
| 38 | + if not text: |
| 39 | + return "" |
| 40 | + # remove any occurrence like "(no pagination)" with flexible whitespace/case |
| 41 | + text = re.sub(r"\(\s*no\s+pagination\s*\)", "", text, flags=re.IGNORECASE) |
| 42 | + return re.sub(r"\s+", " ", text).strip() |
| 43 | + |
| 44 | + |
def _is_monthish(token: str) -> bool:
    """Return True if *token* looks like a month/date label (to be ignored).

    A token is "monthish" when any of its alphanumeric words is a month name
    or abbreviation, e.g. "JAN", "february 2012", "(7 JUL)".
    """
    if not token:
        return False
    t = token.strip().lower()
    # Replace punctuation so "(7 jul)" splits cleanly into words.
    t_clean = re.sub(r"[^a-z0-9 ]", " ", t)
    parts = [p for p in t_clean.split() if p]
    # NOTE(review): the original also special-cased "<digit> <month>" pairs,
    # but that branch was unreachable -- any() below already covers it.
    return any(p in _MONTH_TOKENS for p in parts)
| 62 | + |
| 63 | + |
| 64 | +def _normalize_supplement(token: str) -> str: |
| 65 | + """Normalize common supplement formats lightly (keep informative text).""" |
| 66 | + if not token: |
| 67 | + return "" |
| 68 | + |
| 69 | + t = token.strip() |
| 70 | + t = re.sub(r"\s+", " ", t) |
| 71 | + |
| 72 | + # SUPPL. 1 -> SUPPL.1 ; SUPPL.1 -> SUPPL.1 |
| 73 | + t = re.sub(r"(?i)\bSUPPL\.?\s*(\d+)\b", r"SUPPL.\1", t) |
| 74 | + |
| 75 | + # "Supplement3" -> "Supplement 3" |
| 76 | + t = re.sub(r"(?i)\bSupplement\s*([0-9]+)\b", r"Supplement \1", t) |
| 77 | + t = re.sub(r"(?i)\bSupplement([0-9]+)\b", r"Supplement \1", t) |
| 78 | + |
| 79 | + # "SPEC.ISS 1" / "Spec.Iss 1" -> "Spec.Iss 1" |
| 80 | + t = re.sub(r"(?i)\bSPEC\.?\s*ISS\.?\s*(\d+)\b", r"Spec.Iss \1", t) |
| 81 | + |
| 82 | + return t.strip() |
| 83 | + |
| 84 | + |
def fix_schema_misalignments(split_df: pd.DataFrame) -> None:
    """
    Fix common schema misalignments where volume/number/pages contain mixed content.

    Mutates ``split_df`` in-place and returns None. Missing columns among
    volume/number/pages/year are created as empty strings first.

    Rules:
    - '(no pagination)' is removed wherever it appears, but pages MUST NOT be set.
    - Month-like tokens (JAN, FEBRUARY 2012, '(7 JUL)', etc.) are removed/ignored.
    - "Strange large issue" values are not treated specially (left as-is if parsed).
    - Existing non-empty number/year values are never overwritten.
    """
    if split_df.empty:
        return

    # Ensure all target columns exist so later .loc writes never KeyError.
    for col in ("volume", "number", "pages", "year"):
        if col not in split_df.columns:
            split_df[col] = ""

    # Helper: a column as a NaN-safe, stripped string Series.
    def s(col: str) -> pd.Series:
        return split_df[col].fillna("").astype(str).str.strip()

    # 1) Strip '(no pagination)' everywhere (volume/number/pages).
    split_df["volume"] = s("volume").map(_strip_no_pagination)
    split_df["number"] = s("number").map(_strip_no_pagination)
    split_df["pages"] = s("pages").map(_strip_no_pagination)

    # Snapshots of the current column states; refreshed after each mutating
    # step because later masks must see the updated values.
    num = s("number")
    pag = s("pages")
    yr = s("year")

    # 2) Pages like "(1)" or "(4)" -> move into number if number is empty,
    #    then clear pages. "(1) (no pagination)" was already reduced to "(1)"
    #    by step 1.
    m_pages_issue = pag.str.extract(r"^\(\s*(?P<iss>[^)]+?)\s*\)$")
    mask_pages_issue = m_pages_issue["iss"].notna()
    if mask_pages_issue.any():
        issue_val = m_pages_issue["iss"].fillna("").astype(str).str.strip()
        # Skip month-like labels ("(7 JUL)") -- those are dates, not issues.
        mask_set = mask_pages_issue & (num == "") & (~issue_val.map(_is_monthish))
        split_df.loc[mask_set, "number"] = issue_val[mask_set].map(
            _normalize_supplement
        )
        split_df.loc[mask_pages_issue, "pages"] = ""  # clear pages (never set to 1)
        # Refresh snapshots after mutation.
        num = s("number")
        pag = s("pages")

    # 3) Volume that is only "(4)" means an issue without a volume: move it
    #    to number if empty, and always clear the volume. Monthish tokens
    #    like "(7 JUL)" are dropped entirely.
    vol_now = s("volume")
    m_only_paren = vol_now.str.extract(r"^\(\s*(?P<tok>[^)]+?)\s*\)$")
    mask_only_paren = m_only_paren["tok"].notna()
    if mask_only_paren.any():
        tok = m_only_paren["tok"].fillna("").astype(str).str.strip()
        mask_set = mask_only_paren & (num == "") & (~tok.map(_is_monthish))
        split_df.loc[mask_set, "number"] = tok[mask_set].map(_normalize_supplement)
        # Always clear volume if it was only "(...)" (monthish or not).
        split_df.loc[mask_only_paren, "volume"] = ""
        num = s("number")

    # 4) Year stored where volume should be: "2017 (10)" or "2017".
    #    Copy the year if the year field is empty; a trailing "(...)" moves
    #    to number (unless monthish). The volume is cleared either way.
    vol_now = s("volume")
    m_year = vol_now.str.extract(r"^(?P<year>\d{4})(?:\s*\(\s*(?P<iss>[^)]+?)\s*\))?$")
    mask_year = m_year["year"].notna()
    if mask_year.any():
        yval = m_year["year"].fillna("").astype(str).str.strip()
        iss = m_year["iss"].fillna("").astype(str).str.strip()

        # Only fill year where it is currently empty (never overwrite).
        mask_set_year = mask_year & (yr == "") & (yval != "")
        split_df.loc[mask_set_year, "year"] = yval[mask_set_year]

        # Fill number from the parenthesized part if present and not a date.
        mask_set_num = mask_year & (num == "") & (iss != "") & (~iss.map(_is_monthish))
        split_df.loc[mask_set_num, "number"] = iss[mask_set_num].map(
            _normalize_supplement
        )

        # Clear volume (it held a year, not a volume).
        split_df.loc[mask_year, "volume"] = ""

        num = s("number")
        yr = s("year")

    # 5) Main pattern: "V (X)" where X may itself contain parentheses, e.g.
    #    "2 (2(2))". The greedy (?P<iss>.+) captures everything up to the
    #    LAST ')', which is exactly what [^)]+? cannot do for nested parens.
    vol_now = s("volume")

    m_vol_issue = vol_now.str.extract(
        r"^(?P<vol>[A-Za-z0-9]+)\s*\(\s*(?P<iss>.+)\s*\)$"
    )

    mask_vol_issue = m_vol_issue["vol"].notna()
    if mask_vol_issue.any():
        v = m_vol_issue["vol"].fillna("").astype(str).str.strip()
        iss = m_vol_issue["iss"].fillna("").astype(str).str.strip()

        # The leading token is always the real volume.
        split_df.loc[mask_vol_issue, "volume"] = v[mask_vol_issue]

        # Fill number only if empty and the issue isn't a month-like label.
        mask_set_num = (
            mask_vol_issue & (num == "") & (iss != "") & (~iss.map(_is_monthish))
        )
        split_df.loc[mask_set_num, "number"] = iss[mask_set_num].map(
            _normalize_supplement
        )

        num = s("number")

    # 6) Re-normalize: NaN-safe, stripped strings in all three columns.
    split_df["volume"] = s("volume")
    split_df["number"] = s("number")
    split_df["pages"] = s("pages")

    # 7) A field that is now literally "no pagination" (parens already
    #    stripped) is dropped too (rare leftover from step 1).
    for col in ("volume", "number", "pages"):
        split_df.loc[
            split_df[col]
            .fillna("")
            .astype(str)
            .str.strip()
            .str.lower()
            .eq("no pagination"),
            col,
        ] = ""

    return