Skip to content

Commit 8f0a477

Browse files
author
Gerit Wagner
committed
update
1 parent 09e63c2 commit 8f0a477

File tree

4 files changed

+207
-185
lines changed

4 files changed

+207
-185
lines changed

bib_dedupe/prep.py

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,6 @@
42 42
from bib_dedupe.prep_volume import prep_volume
43 43
from bib_dedupe.prep_year import prep_year
44 44

45-
46 45
pd.set_option("future.no_silent_downcasting", True)
47 46

48 47
REQUIRED_FIELDS = [ID, ENTRYTYPE, TITLE, AUTHOR, YEAR]
@@ -82,12 +81,10 @@ def prepare_df_split(split_df: pd.DataFrame) -> pd.DataFrame:
82 81
Returns:
83 82
The processed dataframe.
84 83
"""
84+
85+
# Substring replacements (anywhere in the string)
85 86
split_df.replace(
86 87
to_replace={
87-
"UNKNOWN": "",
88-
"n/a": "",
89-
"N/A": "",
90-
"NA": "",
91 88
"&": "and",
92 89
" & ": " and ",
93 90
" + ": " and ",
@@ -97,6 +94,27 @@ def prepare_df_split(split_df: pd.DataFrame) -> pd.DataFrame:
97 94

98 95
set_container_title(split_df)
99 96

97+
# Whole-string only replacements (case-insensitive)
98+
cols = [
99+
AUTHOR,
100+
TITLE,
101+
CONTAINER_TITLE,
102+
YEAR,
103+
VOLUME,
104+
NUMBER,
105+
PAGES,
106+
ABSTRACT,
107+
DOI,
108+
]
109+
# column-wise (Series.str, not DataFrame.str)
110+
norm2 = split_df[cols].apply(
111+
lambda col: col.astype("string")
112+
.str.strip()
113+
.str.upper()
114+
.str.replace("/", "", regex=False)
115+
)
116+
split_df[cols] = split_df[cols].mask(norm2.isin(["UNKNOWN", "NA"]), "")
117+
100 118
split_df["author_full"] = split_df[AUTHOR]
101 119

102 120
fix_schema_misalignments(split_df)

0 commit comments

Comments
 (0)