drop records with empty titles in block (not prep)

Gerit Wagner · Gerit Wagner · commit 68fa9464eccb · 2025-10-18T07:34:37.000+02:00
Rationale:
- The `prep()` method is not expected to remove records
- The changes prevent errors in the following scenario:

// When users replace records_df with the prepared records
records_df = prep(records_df)
actual_blocked_df = block(records_df)
matched_df = match(actual_blocked_df)
duplicate_id_sets = cluster(matched_df)
// records_df would now be missing the records without titles,
// effectively producing false positives (FPs):
merged_df = merge(records_df, duplicate_id_sets=duplicate_id_sets)
// This error can easily go unnoticed.

When records are instead removed in the `block()` method, this error is
prevented: actual_blocked_df has a different structure than records_df,
so mis-assignments would raise errors. The resulting merged_df
would be formatted (prepared), but no records would be missing.
diff --git a/bib_dedupe/block.py b/bib_dedupe/block.py
@@ -18,6 +18,7 @@
 from bib_dedupe.constants.fields import TITLE_SHORT
 from bib_dedupe.constants.fields import VOLUME
 from bib_dedupe.constants.fields import YEAR
+from bib_dedupe.constants.fields import TITLE
 
 block_fields_list = [
     {AUTHOR_FIRST, YEAR},
@@ -241,6 +242,13 @@ def block(records_df: pd.DataFrame, cpu: int = -1) -> pd.DataFrame:
     )
     start_time = time.time()
 
+    if records_df[TITLE].isnull().any():
+        verbose_print.print(
+            "Warning: Some records have empty title field. These records will not be considered."
+        )
+        records_df = records_df.dropna(subset=[TITLE])
+
+
     pairs_df = pd.DataFrame(columns=["ID_1", "ID_2", "require_title_overlap"])
     pairs_df = pairs_df.astype(
         {"ID_1": str, "ID_2": str, "require_title_overlap": bool}
diff --git a/bib_dedupe/prep.py b/bib_dedupe/prep.py
@@ -138,11 +138,6 @@ def __general_prep(records_df: pd.DataFrame) -> pd.DataFrame:
         records_df[column] = records_df[column].replace(
             ["#NAME?", "UNKNOWN", ""], np.nan
         )
-    if records_df[TITLE].isnull().any():
-        verbose_print.print(
-            "Warning: Some records have empty title field. These records will not be considered."
-        )
-        records_df = records_df.dropna(subset=[TITLE])
 
     # if columns are of type float, we need to avoid casting "3.0" to "30"
     for col in records_df.columns: