Skip to content

Commit 85a342f

Browse files
authored
Keep Pandas datatypes when using pyarrow dtypes (#459)
1 parent 6aa5759 commit 85a342f

File tree

9 files changed

+82
-7
lines changed

9 files changed

+82
-7
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "dapla-toolbelt-pseudo"
3-
version = "4.3.3"
3+
version = "4.4.0"
44
description = "Pseudonymization extensions for Dapla"
55
authors = ["Dapla Developers <dapla-platform-developers@ssb.no>"]
66
license = "MIT"

src/dapla_pseudo/v1/baseclasses.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from concurrent.futures import ThreadPoolExecutor
1111
from datetime import date
1212

13+
import pandas as pd
1314
import polars as pl
1415
from dapla_metadata.datasets.core import Datadoc
1516

@@ -65,6 +66,7 @@ def _execute_pseudo_operation(
6566
custom_keyset: PseudoKeyset | str | None = None,
6667
target_custom_keyset: PseudoKeyset | str | None = None, # used in repseudo
6768
target_rules: list[PseudoRule] | None = None, # used in repseudo
69+
schema: pd.Series | pl.Schema | None = None,
6870
) -> Result:
6971
if self._dataset is None:
7072
raise ValueError("No dataset has been provided.")
@@ -92,6 +94,7 @@ def _execute_pseudo_operation(
9294
for pseudo_rule in (target_rules if target_rules else rules)
9395
],
9496
user_provided_metadata=self._user_provided_metadata,
97+
schema=schema,
9598
)
9699

97100
def _pseudonymize_field(

src/dapla_pseudo/v1/depseudo.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,20 @@ class Depseudonymize:
2626
"""
2727

2828
dataset: pl.DataFrame
29+
schema: pd.Series | pl.Schema
2930

3031
@staticmethod
3132
def from_pandas(dataframe: pd.DataFrame) -> "Depseudonymize._Depseudonymizer":
3233
"""Initialize a depseudonymization request from a pandas DataFrame."""
3334
Depseudonymize.dataset = pl.from_pandas(dataframe)
35+
Depseudonymize.schema = dataframe.dtypes
3436
return Depseudonymize._Depseudonymizer()
3537

3638
@staticmethod
3739
def from_polars(dataframe: pl.DataFrame) -> "Depseudonymize._Depseudonymizer":
3840
"""Initialize a depseudonymization request from a polars DataFrame."""
3941
Depseudonymize.dataset = dataframe
42+
Depseudonymize.schema = dataframe.schema
4043
return Depseudonymize._Depseudonymizer()
4144

4245
class _Depseudonymizer(_BasePseudonymizer):
@@ -92,7 +95,7 @@ def run(
9295
)
9396

9497
result = super()._execute_pseudo_operation(
95-
self.rules, timeout, custom_keyset
98+
self.rules, timeout, custom_keyset, schema=Depseudonymize.schema
9699
)
97100
return result
98101

src/dapla_pseudo/v1/pseudo.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ class Pseudonymize:
2626
"""
2727

2828
dataset: pl.DataFrame
29+
schema: pd.Series | pl.Schema
2930

3031
@staticmethod
3132
def from_pandas(dataframe: pd.DataFrame) -> "Pseudonymize._Pseudonymizer":
@@ -38,6 +39,7 @@ def from_pandas(dataframe: pd.DataFrame) -> "Pseudonymize._Pseudonymizer":
3839
_Pseudonymizer: An instance of the _Pseudonymizer class.
3940
"""
4041
Pseudonymize.dataset = pl.from_pandas(dataframe)
42+
Pseudonymize.schema = dataframe.dtypes
4143
return Pseudonymize._Pseudonymizer()
4244

4345
@staticmethod
@@ -51,6 +53,7 @@ def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
5153
_Pseudonymizer: An instance of the _Pseudonymizer class.
5254
"""
5355
Pseudonymize.dataset = dataframe
56+
Pseudonymize.schema = dataframe.schema
5457
return Pseudonymize._Pseudonymizer()
5558

5659
class _Pseudonymizer(_BasePseudonymizer):
@@ -60,7 +63,9 @@ class _Pseudonymizer(_BasePseudonymizer):
6063
metadata: Datadoc | None = None
6164

6265
def __init__(
63-
self, rules: list[PseudoRule] | None = None, metadata: Datadoc | None = None
66+
self,
67+
rules: list[PseudoRule] | None = None,
68+
metadata: Datadoc | None = None,
6469
) -> None:
6570
"""Initialize the class."""
6671
if rules is None:
@@ -109,14 +114,17 @@ def run(
109114
hierarchical=hierarchical,
110115
user_provided_metadata=self.metadata,
111116
)
112-
113117
result = super()._execute_pseudo_operation(
114-
self.rules, timeout, custom_keyset
118+
self.rules, timeout, custom_keyset, schema=Pseudonymize.schema
115119
)
116120
return result
117121

118122
class _PseudoFuncSelector(_BaseRuleConstructor):
119-
def __init__(self, fields: list[str], metadata: Datadoc | None) -> None:
123+
def __init__(
124+
self,
125+
fields: list[str],
126+
metadata: Datadoc | None,
127+
) -> None:
120128
self._fields = fields
121129
self._metadata = metadata
122130
super().__init__(fields)

src/dapla_pseudo/v1/repseudo.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,20 @@ class Repseudonymize:
2525
"""
2626

2727
dataset: pl.DataFrame
28+
schema: pd.Series | pl.Schema
2829

2930
@staticmethod
3031
def from_pandas(dataframe: pd.DataFrame) -> "Repseudonymize._Repseudonymizer":
3132
"""Initialize a repseudonymization request from a pandas DataFrame."""
3233
Repseudonymize.dataset = pl.from_pandas(dataframe)
34+
Repseudonymize.schema = dataframe.dtypes
3335
return Repseudonymize._Repseudonymizer()
3436

3537
@staticmethod
3638
def from_polars(dataframe: pl.DataFrame) -> "Repseudonymize._Repseudonymizer":
3739
"""Initialize a repseudonymization request from a polars DataFrame."""
3840
Repseudonymize.dataset = dataframe
41+
Repseudonymize.schema = dataframe.schema
3942
return Repseudonymize._Repseudonymizer()
4043

4144
class _Repseudonymizer(_BasePseudonymizer):
@@ -110,6 +113,7 @@ def run(
110113
custom_keyset=source_custom_keyset,
111114
target_custom_keyset=target_custom_keyset,
112115
timeout=timeout,
116+
schema=Repseudonymize.schema,
113117
)
114118
return result
115119

src/dapla_pseudo/v1/result.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,13 @@ def __init__(
3030
pseudo_operation: PseudoOperation | None = None,
3131
targeted_columns: list[str] | None = None,
3232
user_provided_metadata: Datadoc | None = None,
33+
schema: pd.Series | pl.Schema | None = None,
3334
) -> None:
3435
"""Initialise a PseudonymizationResult."""
3536
self._pseudo_data: pl.DataFrame = pseudo_response.data
3637
self._metadata: dict[str, dict[str, list[Any]]] = {}
3738
self._datadoc: Datadoc | MetadataContainer
39+
self._schema = schema
3840

3941
datadoc_fields: list[Variable] = []
4042
datadoc_paths: list[str | None] = []
@@ -144,7 +146,13 @@ def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
144146
"""
145147
match self._pseudo_data:
146148
case pl.DataFrame() as df:
147-
return df.to_pandas()
149+
pandas_df = df.to_pandas()
150+
if isinstance(
151+
self._schema, pd.Series
152+
): # Apply original schema if available
153+
pandas_df = pandas_df.astype(self._schema)
154+
155+
return pandas_df
148156
case _ as invalid_pseudo_data:
149157
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")
150158

tests/conftest.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,22 @@ def df_personer_sid_fnr() -> pl.DataFrame:
278278
)
279279

280280

281+
@pytest_cases.fixture()
282+
def pandas_diverse_datatypes() -> pd.DataFrame:
283+
JSON_FILE = "tests/data/diverse_datatypes.json"
284+
return pd.read_json(
285+
JSON_FILE,
286+
dtype={
287+
"string_field": "string[pyarrow]",
288+
"int_field": "Int64[pyarrow]",
289+
"float_field": "Float64[pyarrow]",
290+
"date_pseudonymized": "datetime64[s]",
291+
"bool_field": "boolean[pyarrow]",
292+
},
293+
dtype_backend="pyarrow",
294+
)
295+
296+
281297
@pytest_cases.fixture()
282298
def single_field_response() -> MagicMock:
283299
mock_response = MagicMock()

tests/data/diverse_datatypes.json

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[
2+
{
3+
"fnr": "11854898347",
4+
"number": 1,
5+
"floater": 1.0,
6+
"date_pseudonymized": "2023-10-27T10:30:00",
7+
"boolean": true
8+
},
9+
{
10+
"fnr": "11852228347",
11+
"number": 10,
12+
"floater": -1.0,
13+
"date_pseudonymized": "2013-10-27T10:30:00",
14+
"boolean": false
15+
}
16+
]

tests/v1/integration/test_result.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,20 @@ def test_pseudonymize_input_output_funcs(
138138
case "polars":
139139
df_polars = result.to_polars()
140140
pl_assert_frame_equal(df_polars, df_personer_fnr_daead_encrypted)
141+
142+
143+
@pytest.mark.usefixtures("setup")
144+
@integration_test()
145+
def test_pseudonymize_with_arrow_dtypes(
146+
pandas_diverse_datatypes: pd.DataFrame,
147+
) -> None:
148+
"""This test ensures that datatypes are retained when converting internally to and from Pandas."""
149+
result = (
150+
Pseudonymize.from_pandas(pandas_diverse_datatypes)
151+
.on_fields("fnr")
152+
.with_default_encryption()
153+
.run()
154+
)
155+
156+
df_result = result.to_pandas()
157+
assert df_result.dtypes.equals(pandas_diverse_datatypes.dtypes)

0 commit comments

Comments
 (0)