
Commit 6769131

Merge pull request #184 from ipums/remove-deprecated
Remove deprecated code for version 4
2 parents 7f802db + 2004b2e commit 6769131

9 files changed: 92 additions & 133 deletions


hlink/linking/core/transforms.py

Lines changed: 6 additions & 13 deletions
@@ -515,21 +515,14 @@ def apply_transform(
         return column_select[transform["value"]]
     elif transform_type == "mapping":
         mapped_column = column_select
-        if transform.get("values", False):
-            print(
-                "DEPRECATION WARNING: The 'mapping' transform no longer takes the 'values' parameter with a list of mappings in dictionaries; instead each mapping should be its own transform. Please change your config for future releases."
-            )
-            for mapping in transform["values"]:
-                from_regexp = "|".join(f"^{from_val}$" for from_val in mapping["from"])
-                mapped_column = regexp_replace(
-                    mapped_column, from_regexp, str(mapping["to"])
-                )
-        else:
-            for key, value in transform["mappings"].items():
-                from_regexp = f"^{key}$"
-                mapped_column = regexp_replace(mapped_column, from_regexp, str(value))
+
+        for key, value in transform["mappings"].items():
+            from_regexp = f"^{key}$"
+            mapped_column = regexp_replace(mapped_column, from_regexp, str(value))
+
         if transform.get("output_type", False) == "int":
             mapped_column = mapped_column.cast(LongType())
+
         return mapped_column
     elif transform_type == "swap_words":
         mapped_column = column_select
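
As a quick illustration, the simplified branch above only reads a flat `mappings` table: each key is matched as an exact, anchored regular expression (`^key$`) and replaced with its value, with an optional cast when `output_type = "int"`. The sketch below mirrors the documented example updated later in this commit.

```toml
[[column_mappings]]
column_name = "birthyr"
alias = "clean_birthyr"

[[column_mappings.transforms]]
type = "mapping"
mappings = {9999 = "", 1999 = "", "-9998" = "9999"}
output_type = "int"
```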

hlink/linking/matching/_helpers.py

Lines changed: 0 additions & 14 deletions
This file was deleted.

hlink/linking/matching/link_step_explode.py

Lines changed: 1 addition & 2 deletions
@@ -9,7 +9,6 @@
 from pyspark.sql.functions import array, explode, col
 
 import hlink.linking.core.comparison as comparison_core
-from . import _helpers as matching_helpers
 from hlink.linking.link_step import LinkStep
 
 
@@ -41,7 +40,7 @@ def _run(self):
         )
 
         # self.spark.sql("set spark.sql.shuffle.partitions=4000")
-        blocking = matching_helpers.get_blocking(config)
+        blocking = config["blocking"]
 
         self.task.run_register_python(
             name="exploded_df_a",

hlink/linking/matching/link_step_match.py

Lines changed: 1 addition & 2 deletions
@@ -11,7 +11,6 @@
 import hlink.linking.core.dist_table as dist_table_core
 import hlink.linking.core.comparison as comparison_core
 from hlink.linking.util import spark_shuffle_partitions_heuristic
-from . import _helpers as matching_helpers
 
 from hlink.linking.link_step import LinkStep
 
@@ -83,7 +82,7 @@ def _run(self):
             f"Dataset sizes are A={dataset_size_a}, B={dataset_size_b}, so set Spark partitions to {num_partitions} for this step"
         )
 
-        blocking = matching_helpers.get_blocking(config)
+        blocking = config["blocking"]
 
         t_ctx = {}
         if config.get("comparisons", False):
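
Both matching steps now read the blocking configuration directly from `config["blocking"]`, so the config must provide a flat `blocking` array of tables; the nested `blocking_steps` form handled by the removed `_helpers` module is no longer accepted. A minimal sketch, using the same `sex` column as the updated test further down:

```toml
# Flat array of blocking tables, read directly via config["blocking"].
[[blocking]]
column_name = "sex"
```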

hlink/linking/preprocessing/link_step_prep_dataframes.py

Lines changed: 1 addition & 10 deletions
@@ -95,17 +95,8 @@ def _prep_dataframe(
         df_selected = df
         spark = self.task.spark
         column_selects = [col(id_column)]
-        if column_definitions and isinstance(column_definitions[0], list):
-            print(
-                "DEPRECATION WARNING: The config value 'column_mappings' is no longer a nested (double) array and is now an array of objects. Please change your config for future releases."
-            )
-            flat_column_mappings = [
-                item for sublist in column_definitions for item in sublist
-            ]
-        else:
-            flat_column_mappings = column_definitions
 
-        for column_mapping in flat_column_mappings:
+        for column_mapping in column_definitions:
             df_selected, column_selects = column_mapping_core.select_column_mapping(
                 column_mapping, df_selected, is_a, column_selects
             )
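
Per the removed deprecation branch, `column_mappings` must now be a flat array of objects rather than a nested (double) array. A minimal sketch of the expected shape; the column names here are only illustrative:

```toml
# One table per mapped column, not an array of arrays.
[[column_mappings]]
column_name = "namelast"

[[column_mappings]]
column_name = "birthyr"
alias = "clean_birthyr"
```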

hlink/linking/transformers/interaction_transformer.py

Lines changed: 0 additions & 72 deletions
This file was deleted.

hlink/tests/core/transforms_test.py

Lines changed: 68 additions & 0 deletions
@@ -343,3 +343,71 @@ def test_apply_transform_error_when_unrecognized_transform_type(is_a: bool) -> None:
     transform = {"type": "not_supported"}
     with pytest.raises(ValueError, match="Invalid transform type"):
         apply_transform(column_select, transform, is_a)
+
+
+@pytest.mark.parametrize("is_a", [True, False])
+def test_apply_transform_mapping(spark: SparkSession, is_a: bool) -> None:
+    transform = {"type": "mapping", "mappings": {"first": "abcd", "second": "efg"}}
+    input_col = col("input")
+    output_col = apply_transform(input_col, transform, is_a)
+
+    df = spark.createDataFrame(
+        [
+            ["first"],
+            ["second"],
+            ["third"],
+            ["secondagain"],
+        ],
+        "input:string",
+    )
+
+    transformed = df.select(output_col.alias("output"))
+    rows = transformed.collect()
+
+    # Note that the mapping must exactly match the value to transform it, so the
+    # value "secondagain" is unchanged.
+    assert rows == [
+        Row(output="abcd"),
+        Row(output="efg"),
+        Row(output="third"),
+        Row(output="secondagain"),
+    ]
+
+
+@pytest.mark.parametrize("is_a", [True, False])
+def test_apply_transform_mapping_integer_column(
+    spark: SparkSession, is_a: bool
+) -> None:
+    """
+    The mapping transform works over integer columns, and you can cast the output
+    to an integer by passing output_type = "int".
+    """
+    transform = {
+        "type": "mapping",
+        "mappings": {"1": "10", "2": "30", "3": ""},
+        "output_type": "int",
+    }
+    input_col = col("input")
+    output_col = apply_transform(input_col, transform, is_a)
+
+    df = spark.createDataFrame(
+        [
+            [5],
+            [4],
+            [3],
+            [2],
+            [1],
+        ],
+        "input:integer",
+    )
+
+    transformed = df.select(output_col.alias("output"))
+    rows = transformed.collect()
+
+    assert rows == [
+        Row(output=5),
+        Row(output=4),
+        Row(output=None),
+        Row(output=30),
+        Row(output=10),
+    ]
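
For orientation, the transform dict in the first test above corresponds to roughly this TOML in a user config (the column name is hypothetical); as the test's comment notes, only exact matches are recoded:

```toml
[[column_mappings]]
column_name = "status"

[[column_mappings.transforms]]
type = "mapping"
mappings = {first = "abcd", second = "efg"}
```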

hlink/tests/matching_comparison_features_test.py

Lines changed: 2 additions & 9 deletions
@@ -654,10 +654,9 @@ def test_step_2_jaro_winkler_rate(
     )["neighbor_namelast_jw_rate_threshold"].iloc[0]
 
 
-def test_step_2_JW_double_array_blocking_conf(spark, matching_conf, matching, capsys):
+def test_step_2_JW_with_blocking(spark, matching_conf, matching):
     """Test matching step 2 to ensure that comparison features are generated (can a regular comparison (as represented by J/W) still run if there's NOT a distance lookup feature)"""
-    matching_conf["blocking_steps"] = [[{"column_name": "sex"}]]
-    matching_conf.pop("blocking")
+    matching_conf["blocking"] = [{"column_name": "sex"}]
 
     matching_conf["comparison_features"] = [
         {
@@ -685,12 +684,6 @@ def test_step_2_JW_double_array_blocking_conf(spark, matching_conf, matching, capsys):
         > 0.87
     )
 
-    captured = capsys.readouterr()
-    assert (
-        "DEPRECATION WARNING: The config value 'blocking_steps' has been renamed to 'blocking' and is now just a single array of objects."
-        in captured.out
-    )
-
 
 def test_step_2_comparison_features_comp_c_and_caution(
     spark, matching_comparison_conf, matching

sphinx-docs/column_mappings.md

Lines changed: 13 additions & 11 deletions
@@ -288,25 +288,27 @@ transforms = [
 
 ### mapping
 
-Map single or multiple values to a single output value, otherwise known as a "recoding."
+Explicitly map from input values to output values. This is also known as a "recoding".
+Input values which do not appear in the mapping are unchanged. By default, the output
+column is of type string, but you can set `output_type = "int"` to cast the output
+column to type integer instead.
 
 Maps T → U.
 
-```
+```toml
 [[column_mappings]]
 column_name = "birthyr"
 alias = "clean_birthyr"
-transforms = [
-    {
-        type = "mapping",
-        values = [
-            {"from"=[9999,1999], "to" = ""},
-            {"from" = -9998, "to" = 9999}
-        ]
-    }
-]
+
+[[column_mappings.transforms]]
+type = "mapping"
+mappings = {9999 = "", 1999 = "", "-9998" = "9999"}
+output_type = "int"
 ```
 
+*Changed in version 4.0.0: The deprecated `values` key is no longer supported.
+Please use the `mappings` key documented above instead.*
+
 ### substring
 
 Replace a column with a substring of the data in the column.
