Skip to content

Commit 5a89715

Browse files
author
Gerit Wagner
committed
revise prep (SettingWithCopyWarning)
1 parent 3e9d6fe commit 5a89715

File tree

1 file changed

+14
-6
lines changed

1 file changed

+14
-6
lines changed

bib_dedupe/prep.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -157,12 +157,20 @@ def __general_prep(records_df: pd.DataFrame) -> pd.DataFrame:
157157
if optional_field not in records_df.columns:
158158
records_df = records_df.assign(**{optional_field: ""})
159159

160-
records_df = records_df.drop(
161-
labels=list(records_df.columns.difference(ALL_FIELDS)),
162-
axis=1,
163-
)
164-
records_df.loc[:, CONTAINER_TITLE] = ""
165-
records_df.loc[:, ALL_FIELDS] = records_df[ALL_FIELDS].astype(str)
160+
# ensure the container title exists and is string-typed
161+
if CONTAINER_TITLE not in records_df.columns:
162+
records_df[CONTAINER_TITLE] = pd.Series(
163+
"", index=records_df.index, dtype="string"
164+
)
165+
166+
# keep only the fields of interest
167+
records_df = records_df.loc[:, ALL_FIELDS].copy()
168+
169+
# cast the target columns to pandas StringDtype
170+
records_df = records_df.astype({col: "string" for col in ALL_FIELDS}, copy=False)
171+
172+
# replace pd.NA with empty strings so regex/string ops don't see NAType
173+
records_df.loc[:, ALL_FIELDS] = records_df.loc[:, ALL_FIELDS].fillna("")
166174

167175
return records_df
168176

0 commit comments

Comments
 (0)