Merge pull request #229 from ipums/make-v4.2.2

riley-harper · web-flow · commit ec89783f4251 · 2026-01-20T09:44:34.000-06:00
Bump the version to 4.2.2
diff --git a/examples/tutorial/tutorial.py b/examples/tutorial/tutorial.py
@@ -9,17 +9,15 @@
 
 
 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="""
+    parser = argparse.ArgumentParser(description="""
         This script links two very small example datasets that live in the data
         subdirectory. It reads in the tutorial_config.toml configuration file
         and runs hlink's preprocessing and matching steps to find some potential
         matches between the two datasets.
 
         For a detailed walkthrough of the tutorial, please see the README.md
         file in the same directory as this script.
-        """
-    )
+        """)
 
     parser.add_argument(
         "--clean", action="store_true", help="drop existing Spark tables on startup"
diff --git a/hlink/linking/core/column_mapping.py b/hlink/linking/core/column_mapping.py
@@ -79,7 +79,6 @@ def transform_reverse(input_col: Column, transform: Mapping[str, Any], context:
 )
 from pyspark.sql.types import LongType
 
-
 ColumnMappingTransform: TypeAlias = Callable[
     [Column, Mapping[str, Any], Mapping[str, Any]], Column
 ]
diff --git a/hlink/linking/hh_matching/link_step_block_on_households.py b/hlink/linking/hh_matching/link_step_block_on_households.py
@@ -10,7 +10,6 @@
 from hlink.linking.link_step import LinkStep
 from hlink.linking.util import set_job_description
 
-
 logger = logging.getLogger(__name__)
 
 
diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -843,11 +843,11 @@ def _aggregate_per_threshold_results(
     mcc = [r.mcc for r in prediction_results if not math.isnan(r.mcc)]
     f_measure = [r.f_measure for r in prediction_results if not math.isnan(r.f_measure)]
 
-    (precision_mean, precision_sd) = _compute_mean_and_stdev(precision)
-    (recall_mean, recall_sd) = _compute_mean_and_stdev(recall)
-    (pr_auc_mean, pr_auc_sd) = _compute_mean_and_stdev(pr_auc)
-    (mcc_mean, mcc_sd) = _compute_mean_and_stdev(mcc)
-    (f_measure_mean, f_measure_sd) = _compute_mean_and_stdev(f_measure)
+    precision_mean, precision_sd = _compute_mean_and_stdev(precision)
+    recall_mean, recall_sd = _compute_mean_and_stdev(recall)
+    pr_auc_mean, pr_auc_sd = _compute_mean_and_stdev(pr_auc)
+    mcc_mean, mcc_sd = _compute_mean_and_stdev(mcc)
+    f_measure_mean, f_measure_sd = _compute_mean_and_stdev(f_measure)
 
     new_desc = pd.DataFrame(
         {
@@ -962,17 +962,15 @@ def _handle_param_grid_attribute(training_settings: dict[str, Any]) -> dict[str,
 def _get_model_parameters(training_settings: dict[str, Any]) -> list[dict[str, Any]]:
     if "param_grid" in training_settings:
         print(
-            dedent(
-                """\
+            dedent("""\
                 Deprecation Warning: training.param_grid is deprecated.
 
                 Please use training.model_parameter_search instead by replacing
 
                 `param_grid = True` with `model_parameter_search = {strategy = "grid"}` or
                 `param_grid = False` with `model_parameter_search = {strategy = "explicit"}`
 
-                [deprecated_in_version=4.0.0]"""
-            ),
+                [deprecated_in_version=4.0.0]"""),
             file=sys.stderr,
         )
 
diff --git a/hlink/linking/training/link_step_create_comparison_features.py b/hlink/linking/training/link_step_create_comparison_features.py
@@ -42,8 +42,7 @@ def _create_training_features(self):
         dep_var = config[training_conf]["dependent_var"]
         if training_conf == "hh_training":
             hh_col = config[training_conf].get("hh_col", "serialp")
-            tdl = self.task.spark.sql(
-                f"""SELECT
+            tdl = self.task.spark.sql(f"""SELECT
                                     td.{id_col}_a,
                                     td.{id_col}_b,
                                     td.{dep_var},
@@ -57,8 +56,7 @@ def _create_training_features(self):
                                     left join
                                     prepped_df_b pdfb
                                     on pdfb.{id_col} = td.{id_col}_b
-                                """
-            )
+                                """)
         else:
             tdl = self.task.spark.table(f"{table_prefix}training_data").select(
                 f"{id_col}_a", f"{id_col}_b", dep_var
diff --git a/hlink/linking/util.py b/hlink/linking/util.py
@@ -1,7 +1,6 @@
 from contextlib import contextmanager
 from math import ceil
 
-
 MIN_PARTITIONS = 200
 MAX_PARTITIONS = 10000
 
diff --git a/hlink/tests/core/column_mapping_test.py b/hlink/tests/core/column_mapping_test.py
@@ -5,7 +5,6 @@
 
 from hlink.linking.core.column_mapping import apply_transform, select_column_mapping
 
-
 TEST_DF_1 = pd.DataFrame(
     {
         "id": [0, 1, 2, 3, 4, 5],
diff --git a/hlink/tests/core/substitutions_test.py b/hlink/tests/core/substitutions_test.py
@@ -24,13 +24,11 @@ def test_load_substitutions(tmp_path: Path) -> None:
 
 def test_generate_substitutions(spark: SparkSession, tmp_path: Path) -> None:
     tmp_file = tmp_path / "substitutions.csv"
-    tmp_file.write_text(
-        """rose,rosie
+    tmp_file.write_text("""rose,rosie
         sophia,sophy
         sophia,sofia
         amanda,mandy
-        jane,jean"""
-    )
+        jane,jean""")
 
     df = spark.createDataFrame(
         [("agnes", 2), ("mandy", 2), ("sophy", 2), ("rosie", 2), ("jean", 1)],
diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py
@@ -5,7 +5,6 @@
 from hlink.linking.core.transforms import apply_transform, generate_transforms
 from hlink.linking.link_task import LinkTask
 
-
 ignore_apply_transform_dep_warning = pytest.mark.filterwarnings(
     r"ignore:\s*This is a deprecated alias for hlink.linking.core.column_mapping.apply_transform"
 )
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "hlink"
-version = "4.2.1"
+version = "4.2.2"
 description = "Fast supervised pyspark record linkage software"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/sphinx-docs/changelog.md b/sphinx-docs/changelog.md
@@ -3,7 +3,7 @@
 The format of this changelog is based on [Keep A Changelog][keep-a-changelog].
 Hlink adheres to semantic versioning as much as possible.
 
-## Not Yet Released
+## v4.2.2 (2026-01-20)
 
 ### Added
 

Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,6 @@ def transform_reverse(input_col: Column, transform: Mapping[str, Any], context:`
`79`	`79`	`)`
`80`	`80`	`from pyspark.sql.types import LongType`
`81`	`81`
`82`		`-`
`83`	`82`	`ColumnMappingTransform: TypeAlias = Callable[`
`84`	`83`	`[Column, Mapping[str, Any], Mapping[str, Any]], Column`
`85`	`84`	`]`
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,6 @@`
`5`	`5`
`6`	`6`	`from hlink.linking.core.column_mapping import apply_transform, select_column_mapping`
`7`	`7`
`8`		`-`
`9`	`8`	`TEST_DF_1 = pd.DataFrame(`
`10`	`9`	`{`
`11`	`10`	`"id": [0, 1, 2, 3, 4, 5],`
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,6 @@`
`5`	`5`	`from hlink.linking.core.transforms import apply_transform, generate_transforms`
`6`	`6`	`from hlink.linking.link_task import LinkTask`
`7`	`7`
`8`		`-`
`9`	`8`	`ignore_apply_transform_dep_warning = pytest.mark.filterwarnings(`
`10`	`9`	`r"ignore:\s*This is a deprecated alias for hlink.linking.core.column_mapping.apply_transform"`
`11`	`10`	`)`