Skip to content

Commit 2388b78

Browse files
authored
Add option for lazily evaluating data (#491)
* Add option for lazily evaluating data * fixup * Add README example * make mypy happy * bumper * incorporate improvements * fix tests * fixer * Remove ugly env methods * Actually test lazyframe vs dataframe * fix names and add docstrings * Upgrade to 100 rows
1 parent 3151539 commit 2388b78

22 files changed

+2683
-2179
lines changed

README.md

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Pseudonymize, repseudonymize and depseudonymize data on Dapla.
2828

2929
## Features
3030

31-
Other examples can also be viewed through notebook files for [pseudo](tests/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb)
31+
Other examples can also be viewed through notebook files for [pseudo](tests/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb).
3232

3333
### Pseudonymize
3434

@@ -100,6 +100,8 @@ Note that you may also use a Pandas DataFrame as an input or output, by exchangi
100100
and `to_polars` with `to_pandas`. However, Pandas is much less performant, so take special care especially if your
101101
dataset is large.
102102

103+
`from_polars(...)` accepts both `pl.DataFrame` and `pl.LazyFrame`.
104+
103105
Example:
104106

105107
```python
@@ -113,6 +115,33 @@ df_pandas = (
113115
)
114116
```
115117

118+
### Polars LazyFrame on GCS
119+
120+
You can use `from_polars(...)` with lazy inputs, for example scanning from GCS and writing the pseudonymized output back to GCS.
121+
122+
```python
123+
import os
124+
import polars as pl
125+
from dapla_pseudo import Pseudonymize
126+
127+
bucket = os.environ["BUCKET_NAME"]
128+
input_path = f"gs://{bucket}/pseudo-lazy-demo/input.parquet"
129+
output_path = f"gs://{bucket}/pseudo-lazy-demo/output.parquet"
130+
131+
lazy_df = pl.scan_parquet(input_path)
132+
133+
result = (
134+
Pseudonymize.from_polars(lazy_df)
135+
.on_fields("person_id")
136+
.with_default_encryption()
137+
.run()
138+
)
139+
140+
# Writes both data and datadoc metadata (__DOC.json)
141+
result.to_file(output_path)
142+
```
143+
144+
116145

117146
### Validate SID mapping
118147

examples/pseudo_examples.ipynb

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -298,18 +298,40 @@
298298
"result.to_polars().head()"
299299
]
300300
},
301+
{
302+
"cell_type": "markdown",
303+
"id": "95003fcf",
304+
"metadata": {},
305+
"source": [
306+
"# Case: Stream data from file to file"
307+
]
308+
},
301309
{
302310
"cell_type": "code",
303311
"execution_count": null,
304312
"id": "e5dfac47967eb179",
305313
"metadata": {},
306314
"outputs": [],
307-
"source": []
315+
"source": [
316+
"import polars as pl\n",
317+
"\n",
318+
"lazy_df = pl.scan_parquet(\"input_file.parquet\")\n",
319+
"\n",
320+
"result = (\n",
321+
" Pseudonymize.from_polars(lazy_df)\n",
322+
" .on_fields(\"person_id\")\n",
323+
" .with_default_encryption()\n",
324+
" .run()\n",
325+
")\n",
326+
"\n",
327+
"# Writes both data and datadoc metadata (__DOC.json)\n",
328+
"result.to_file(\"output_file.parquet\")"
329+
]
308330
}
309331
],
310332
"metadata": {
311333
"kernelspec": {
312-
"display_name": ".venv",
334+
"display_name": "dapla-toolbelt-pseudo",
313335
"language": "python",
314336
"name": "python3"
315337
},
@@ -323,7 +345,7 @@
323345
"name": "python",
324346
"nbconvert_exporter": "python",
325347
"pygments_lexer": "ipython3",
326-
"version": "3.12.7"
348+
"version": "3.13.1"
327349
}
328350
},
329351
"nbformat": 4,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "dapla-toolbelt-pseudo"
3-
version = "6.0.3"
3+
version = "6.1.0"
44
description = "Pseudonymization extensions for Dapla"
55
authors = [{ name = "Dapla Developers", email = "dapla-platform-developers@ssb.no" }]
66
requires-python = ">=3.11,<4.0"

src/dapla_pseudo/v1/baseclasses.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class _BasePseudonymizer:
4444
def __init__(
4545
self,
4646
pseudo_operation: PseudoOperation,
47-
dataset: pl.DataFrame,
47+
dataset: pl.DataFrame | pl.LazyFrame,
4848
hierarchical: bool,
4949
user_provided_metadata: Datadoc | None,
5050
) -> None:

src/dapla_pseudo/v1/depseudo.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class Depseudonymize:
2525
This class should not be instantiated, only the static methods should be used.
2626
"""
2727

28-
dataset: pl.DataFrame
28+
dataset: pl.DataFrame | pl.LazyFrame
2929
schema: pd.Series | pl.Schema
3030

3131
@staticmethod
@@ -36,10 +36,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Depseudonymize._Depseudonymizer":
3636
return Depseudonymize._Depseudonymizer()
3737

3838
@staticmethod
39-
def from_polars(dataframe: pl.DataFrame) -> "Depseudonymize._Depseudonymizer":
39+
def from_polars(
40+
dataframe: pl.DataFrame | pl.LazyFrame,
41+
) -> "Depseudonymize._Depseudonymizer":
4042
"""Initialize a depseudonymization request from a polars DataFrame."""
4143
Depseudonymize.dataset = dataframe
42-
Depseudonymize.schema = dataframe.schema
44+
Depseudonymize.schema = (
45+
dataframe.schema
46+
if type(dataframe) is pl.DataFrame
47+
else dataframe.collect_schema()
48+
)
4349
return Depseudonymize._Depseudonymizer()
4450

4551
class _Depseudonymizer(_BasePseudonymizer):
@@ -86,7 +92,15 @@ def run(
8692
8793
Returns:
8894
Result: The depseudonymized dataset and the associated metadata.
95+
96+
Raises:
97+
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
8998
"""
99+
if hierarchical and isinstance(Depseudonymize.dataset, pl.LazyFrame):
100+
raise ValueError(
101+
"Hierarchical datasets are not supported for Polars LazyFrames."
102+
)
103+
90104
super().__init__(
91105
pseudo_operation=PseudoOperation.DEPSEUDONYMIZE,
92106
dataset=Depseudonymize.dataset,

src/dapla_pseudo/v1/models/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,5 +81,5 @@ def __add__(self, other: "RawPseudoMetadata | None") -> "RawPseudoMetadata":
8181
class PseudoFieldResponse:
8282
"""PseudoFieldResponse holds the data and metadata from a Pseudo Service field response."""
8383

84-
data: pl.DataFrame
84+
data: pl.DataFrame | pl.LazyFrame
8585
raw_metadata: list[RawPseudoMetadata]

src/dapla_pseudo/v1/mutable_dataframe.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -57,26 +57,43 @@ def get_value(self) -> list[str | int | None]:
5757
class MutableDataFrame:
5858
"""A DataFrame that can change values in-place."""
5959

60-
def __init__(self, dataframe: pl.DataFrame, hierarchical: bool) -> None:
60+
def __init__(
61+
self, dataframe: pl.DataFrame | pl.LazyFrame, hierarchical: bool
62+
) -> None:
6163
"""Initialize the class."""
62-
self.dataset: pl.DataFrame | dict[str, Any] = dataframe
64+
self.dataset: pl.DataFrame | dict[str, Any] | pl.LazyFrame = dataframe
6365
self.matched_fields: dict[str, FieldMatch] = {}
6466
self.matched_fields_metrics: dict[str, int] | None = None
6567
self.hierarchical: bool = hierarchical
66-
self.schema = dataframe.schema
68+
self.schema = (
69+
dataframe.schema
70+
if isinstance(dataframe, pl.DataFrame)
71+
else dataframe.collect_schema()
72+
)
6773

6874
def match_rules(
6975
self, rules: list[PseudoRule], target_rules: list[PseudoRule] | None
7076
) -> None:
7177
"""Create references to all the columns that matches the given pseudo rules."""
7278
if self.hierarchical is False:
73-
assert isinstance(self.dataset, pl.DataFrame)
79+
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
80+
self.dataset, pl.LazyFrame
81+
)
82+
83+
def extract_column_data(
84+
pattern: str, dataset: pl.DataFrame | pl.LazyFrame
85+
) -> list[Any]:
86+
if isinstance(dataset, pl.DataFrame):
87+
return list(dataset.get_column(pattern))
88+
elif isinstance(dataset, pl.LazyFrame):
89+
return list(dataset.select(pattern).collect().to_series())
90+
7491
self.matched_fields = {
7592
str(i): FieldMatch(
7693
path=rule.pattern,
7794
pattern=rule.pattern,
7895
indexer=[],
79-
col=list(self.dataset.get_column(rule.pattern)),
96+
col=extract_column_data(rule.pattern, self.dataset),
8097
wrapped_list=False,
8198
func=rule.func,
8299
target_func=target_rule.func if target_rule else None,
@@ -109,7 +126,9 @@ def get_matched_fields(self) -> dict[str, FieldMatch]:
109126
def update(self, path: str, data: list[str | None]) -> None:
110127
"""Update a column with the given data."""
111128
if self.hierarchical is False:
112-
assert isinstance(self.dataset, pl.DataFrame)
129+
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
130+
self.dataset, pl.LazyFrame
131+
)
113132
self.dataset = self.dataset.with_columns(pl.Series(data).alias(path))
114133
elif (field_match := self.matched_fields.get(path)) is not None:
115134
assert isinstance(self.dataset, dict)
@@ -122,10 +141,12 @@ def update(self, path: str, data: list[str | None]) -> None:
122141
data if field_match.wrapped_list is False else data[0]
123142
)
124143

125-
def to_polars(self) -> pl.DataFrame:
144+
def to_polars(self) -> pl.DataFrame | pl.LazyFrame:
126145
"""Convert to Polars DataFrame."""
127146
if self.hierarchical is False:
128-
assert isinstance(self.dataset, pl.DataFrame)
147+
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
148+
self.dataset, pl.LazyFrame
149+
)
129150
return self.dataset
130151
else:
131152
assert isinstance(self.dataset, dict)

src/dapla_pseudo/v1/pseudo.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class Pseudonymize:
2525
This class should not be instantiated, only the static methods should be used.
2626
"""
2727

28-
dataset: pl.DataFrame
28+
dataset: pl.DataFrame | pl.LazyFrame
2929
schema: pd.Series | pl.Schema
3030

3131
@staticmethod
@@ -43,7 +43,9 @@ def from_pandas(dataframe: pd.DataFrame) -> "Pseudonymize._Pseudonymizer":
4343
return Pseudonymize._Pseudonymizer()
4444

4545
@staticmethod
46-
def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
46+
def from_polars(
47+
dataframe: pl.DataFrame | pl.LazyFrame,
48+
) -> "Pseudonymize._Pseudonymizer":
4749
"""Initialize a pseudonymization request from a Polars DataFrame.
4850
4951
Args:
@@ -53,7 +55,11 @@ def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
5355
_Pseudonymizer: An instance of the _Pseudonymizer class.
5456
"""
5557
Pseudonymize.dataset = dataframe
56-
Pseudonymize.schema = dataframe.schema
58+
Pseudonymize.schema = (
59+
dataframe.schema
60+
if type(dataframe) is pl.DataFrame
61+
else dataframe.collect_schema()
62+
)
5763
return Pseudonymize._Pseudonymizer()
5864

5965
class _Pseudonymizer(_BasePseudonymizer):
@@ -107,7 +113,15 @@ def run(
107113
108114
Returns:
109115
Result: The pseudonymized dataset and the associated metadata.
116+
117+
Raises:
118+
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
110119
"""
120+
if hierarchical and type(Pseudonymize.dataset) is pl.LazyFrame:
121+
raise ValueError(
122+
"Hierarchical datasets are not supported for Polars LazyFrames."
123+
)
124+
111125
super().__init__(
112126
pseudo_operation=PseudoOperation.PSEUDONYMIZE,
113127
dataset=Pseudonymize.dataset,

src/dapla_pseudo/v1/repseudo.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class Repseudonymize:
2424
This class should not be instantiated, only the static methods should be used.
2525
"""
2626

27-
dataset: pl.DataFrame
27+
dataset: pl.DataFrame | pl.LazyFrame
2828
schema: pd.Series | pl.Schema
2929

3030
@staticmethod
@@ -35,10 +35,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Repseudonymize._Repseudonymizer":
3535
return Repseudonymize._Repseudonymizer()
3636

3737
@staticmethod
38-
def from_polars(dataframe: pl.DataFrame) -> "Repseudonymize._Repseudonymizer":
38+
def from_polars(
39+
dataframe: pl.DataFrame | pl.LazyFrame,
40+
) -> "Repseudonymize._Repseudonymizer":
3941
"""Initialize a pseudonymization request from a polars DataFrame."""
4042
Repseudonymize.dataset = dataframe
41-
Repseudonymize.schema = dataframe.schema
43+
Repseudonymize.schema = (
44+
dataframe.schema
45+
if type(dataframe) is pl.DataFrame
46+
else dataframe.collect_schema()
47+
)
4248
return Repseudonymize._Repseudonymizer()
4349

4450
class _Repseudonymizer(_BasePseudonymizer):
@@ -99,7 +105,15 @@ def run(
99105
100106
Returns:
101107
Result: The pseudonymized dataset and the associated metadata.
108+
109+
Raises:
110+
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
102111
"""
112+
if hierarchical and isinstance(Repseudonymize.dataset, pl.LazyFrame):
113+
raise ValueError(
114+
"Hierarchical datasets are not supported for Polars LazyFrames."
115+
)
116+
103117
super().__init__(
104118
pseudo_operation=PseudoOperation.REPSEUDONYMIZE,
105119
dataset=Repseudonymize.dataset,

0 commit comments

Comments
 (0)