Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Pseudonymize, repseudonymize and depseudonymize data on Dapla.

## Features

Other examples can also be viewed through notebook files for [pseudo](tests/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb)
Other examples can also be viewed through notebook files for [pseudo](tests/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb).

### Pseudonymize

Expand Down Expand Up @@ -100,6 +100,8 @@ Note that you may also use a Pandas DataFrame as an input or output, by exchangi
and `to_polars` with `to_pandas`. However, Pandas is much less performant, so take special care especially if your
dataset is large.

`from_polars(...)` accepts both `pl.DataFrame` and `pl.LazyFrame`.

Example:

```python
Expand All @@ -113,6 +115,33 @@ df_pandas = (
)
```

### Polars LazyFrame on GCS

You can use `from_polars(...)` with lazy inputs, for example scanning from GCS and writing the pseudonymized output back to GCS.

```python
import os
import polars as pl
from dapla_pseudo import Pseudonymize

bucket = os.environ["BUCKET_NAME"]
input_path = f"gs://{bucket}/pseudo-lazy-demo/input.parquet"
output_path = f"gs://{bucket}/pseudo-lazy-demo/output.parquet"

lazy_df = pl.scan_parquet(input_path)

result = (
Pseudonymize.from_polars(lazy_df)
.on_fields("person_id")
.with_default_encryption()
.run()
)

# Writes both the pseudonymized data and the accompanying datadoc metadata file (suffixed __DOC.json)
result.to_file(output_path)
```



### Validate SID mapping

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dapla-toolbelt-pseudo"
version = "6.0.3"
version = "6.1.0"
description = "Pseudonymization extensions for Dapla"
authors = [{ name = "Dapla Developers", email = "dapla-platform-developers@ssb.no" }]
requires-python = ">=3.11,<4.0"
Expand Down
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/baseclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class _BasePseudonymizer:
def __init__(
self,
pseudo_operation: PseudoOperation,
dataset: pl.DataFrame,
dataset: pl.DataFrame | pl.LazyFrame,
hierarchical: bool,
user_provided_metadata: Datadoc | None,
) -> None:
Expand Down
20 changes: 17 additions & 3 deletions src/dapla_pseudo/v1/depseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Depseudonymize:
This class should not be instantiated, only the static methods should be used.
"""

dataset: pl.DataFrame
dataset: pl.DataFrame | pl.LazyFrame
schema: pd.Series | pl.Schema

@staticmethod
Expand All @@ -36,10 +36,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Depseudonymize._Depseudonymizer":
return Depseudonymize._Depseudonymizer()

@staticmethod
def from_polars(dataframe: pl.DataFrame) -> "Depseudonymize._Depseudonymizer":
def from_polars(
dataframe: pl.DataFrame | pl.LazyFrame,
) -> "Depseudonymize._Depseudonymizer":
"""Initialize a depseudonymization request from a polars DataFrame."""
Depseudonymize.dataset = dataframe
Depseudonymize.schema = dataframe.schema
Depseudonymize.schema = (
dataframe.schema
if type(dataframe) is pl.DataFrame
else dataframe.collect_schema()
)
return Depseudonymize._Depseudonymizer()

class _Depseudonymizer(_BasePseudonymizer):
Expand Down Expand Up @@ -86,7 +92,15 @@ def run(

Returns:
Result: The depseudonymized dataset and the associated metadata.

Raises:
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
"""
if hierarchical and isinstance(Depseudonymize.dataset, pl.LazyFrame):
raise ValueError(
"Hierarchical datasets are not supported for Polars LazyFrames."
)

super().__init__(
pseudo_operation=PseudoOperation.DEPSEUDONYMIZE,
dataset=Depseudonymize.dataset,
Expand Down
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/models/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,5 @@ def __add__(self, other: "RawPseudoMetadata | None") -> "RawPseudoMetadata":
class PseudoFieldResponse:
"""PseudoFieldResponse holds the data and metadata from a Pseudo Service field response."""

data: pl.DataFrame
data: pl.DataFrame | pl.LazyFrame
raw_metadata: list[RawPseudoMetadata]
37 changes: 29 additions & 8 deletions src/dapla_pseudo/v1/mutable_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,26 +57,43 @@ def get_value(self) -> list[str | int | None]:
class MutableDataFrame:
"""A DataFrame that can change values in-place."""

def __init__(self, dataframe: pl.DataFrame, hierarchical: bool) -> None:
def __init__(
self, dataframe: pl.DataFrame | pl.LazyFrame, hierarchical: bool
) -> None:
"""Initialize the class."""
self.dataset: pl.DataFrame | dict[str, Any] = dataframe
self.dataset: pl.DataFrame | dict[str, Any] | pl.LazyFrame = dataframe
self.matched_fields: dict[str, FieldMatch] = {}
self.matched_fields_metrics: dict[str, int] | None = None
self.hierarchical: bool = hierarchical
self.schema = dataframe.schema
self.schema = (
dataframe.schema
if isinstance(dataframe, pl.DataFrame)
else dataframe.collect_schema()
)

def match_rules(
self, rules: list[PseudoRule], target_rules: list[PseudoRule] | None
) -> None:
"""Create references to all the columns that matches the given pseudo rules."""
if self.hierarchical is False:
assert isinstance(self.dataset, pl.DataFrame)
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
self.dataset, pl.LazyFrame
)

def extract_column_data(
pattern: str, dataset: pl.DataFrame | pl.LazyFrame
) -> list[Any]:
if isinstance(dataset, pl.DataFrame):
return list(dataset.get_column(pattern))
elif isinstance(dataset, pl.LazyFrame):
return list(dataset.select(pattern).collect().to_series())

self.matched_fields = {
str(i): FieldMatch(
path=rule.pattern,
pattern=rule.pattern,
indexer=[],
col=list(self.dataset.get_column(rule.pattern)),
col=extract_column_data(rule.pattern, self.dataset),
wrapped_list=False,
func=rule.func,
target_func=target_rule.func if target_rule else None,
Expand Down Expand Up @@ -109,7 +126,9 @@ def get_matched_fields(self) -> dict[str, FieldMatch]:
def update(self, path: str, data: list[str | None]) -> None:
"""Update a column with the given data."""
if self.hierarchical is False:
assert isinstance(self.dataset, pl.DataFrame)
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
self.dataset, pl.LazyFrame
)
self.dataset = self.dataset.with_columns(pl.Series(data).alias(path))
elif (field_match := self.matched_fields.get(path)) is not None:
assert isinstance(self.dataset, dict)
Expand All @@ -122,10 +141,12 @@ def update(self, path: str, data: list[str | None]) -> None:
data if field_match.wrapped_list is False else data[0]
)

def to_polars(self) -> pl.DataFrame:
def to_polars(self) -> pl.DataFrame | pl.LazyFrame:
"""Convert to Polars DataFrame."""
if self.hierarchical is False:
assert isinstance(self.dataset, pl.DataFrame)
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
self.dataset, pl.LazyFrame
)
return self.dataset
else:
assert isinstance(self.dataset, dict)
Expand Down
20 changes: 17 additions & 3 deletions src/dapla_pseudo/v1/pseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Pseudonymize:
This class should not be instantiated, only the static methods should be used.
"""

dataset: pl.DataFrame
dataset: pl.DataFrame | pl.LazyFrame
schema: pd.Series | pl.Schema

@staticmethod
Expand All @@ -43,7 +43,9 @@ def from_pandas(dataframe: pd.DataFrame) -> "Pseudonymize._Pseudonymizer":
return Pseudonymize._Pseudonymizer()

@staticmethod
def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
def from_polars(
dataframe: pl.DataFrame | pl.LazyFrame,
) -> "Pseudonymize._Pseudonymizer":
"""Initialize a pseudonymization request from a Polars DataFrame.

Args:
Expand All @@ -53,7 +55,11 @@ def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
_Pseudonymizer: An instance of the _Pseudonymizer class.
"""
Pseudonymize.dataset = dataframe
Pseudonymize.schema = dataframe.schema
Pseudonymize.schema = (
dataframe.schema
if type(dataframe) is pl.DataFrame
else dataframe.collect_schema()
)
return Pseudonymize._Pseudonymizer()

class _Pseudonymizer(_BasePseudonymizer):
Expand Down Expand Up @@ -107,7 +113,15 @@ def run(

Returns:
Result: The pseudonymized dataset and the associated metadata.

Raises:
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
"""
if hierarchical and type(Pseudonymize.dataset) is pl.LazyFrame:
raise ValueError(
"Hierarchical datasets are not supported for Polars LazyFrames."
)

super().__init__(
pseudo_operation=PseudoOperation.PSEUDONYMIZE,
dataset=Pseudonymize.dataset,
Expand Down
20 changes: 17 additions & 3 deletions src/dapla_pseudo/v1/repseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Repseudonymize:
This class should not be instantiated, only the static methods should be used.
"""

dataset: pl.DataFrame
dataset: pl.DataFrame | pl.LazyFrame
schema: pd.Series | pl.Schema

@staticmethod
Expand All @@ -35,10 +35,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Repseudonymize._Repseudonymizer":
return Repseudonymize._Repseudonymizer()

@staticmethod
def from_polars(dataframe: pl.DataFrame) -> "Repseudonymize._Repseudonymizer":
def from_polars(
dataframe: pl.DataFrame | pl.LazyFrame,
) -> "Repseudonymize._Repseudonymizer":
"""Initialize a pseudonymization request from a polars DataFrame."""
Repseudonymize.dataset = dataframe
Repseudonymize.schema = dataframe.schema
Repseudonymize.schema = (
dataframe.schema
if type(dataframe) is pl.DataFrame
else dataframe.collect_schema()
)
return Repseudonymize._Repseudonymizer()

class _Repseudonymizer(_BasePseudonymizer):
Expand Down Expand Up @@ -99,7 +105,15 @@ def run(

Returns:
Result: The pseudonymized dataset and the associated metadata.

Raises:
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
"""
if hierarchical and isinstance(Repseudonymize.dataset, pl.LazyFrame):
raise ValueError(
"Hierarchical datasets are not supported for Polars LazyFrames."
)

super().__init__(
pseudo_operation=PseudoOperation.REPSEUDONYMIZE,
dataset=Repseudonymize.dataset,
Expand Down
30 changes: 21 additions & 9 deletions src/dapla_pseudo/v1/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dapla_pseudo.utils import get_file_format_from_file_name
from dapla_pseudo.v1.models.api import PseudoFieldResponse
from dapla_pseudo.v1.supported_file_format import write_from_df
from dapla_pseudo.v1.supported_file_format import write_from_lazy_df


class Result:
Expand All @@ -32,7 +33,7 @@ def __init__(
schema: pd.Series | pl.Schema | None = None,
) -> None:
"""Initialise a PseudonymizationResult."""
self._pseudo_data: pl.DataFrame = pseudo_response.data
self._pseudo_data: pl.DataFrame | pl.LazyFrame = pseudo_response.data
self._metadata: dict[str, dict[str, list[Any]]] = {}
self._datadoc: Datadoc | list[Variable]
self._schema = schema
Expand Down Expand Up @@ -122,6 +123,11 @@ def to_polars(self, **kwargs: Any) -> pl.DataFrame:
if "__index_level_0__" in df.columns:
df = df.drop("__index_level_0__")
return df
case pl.LazyFrame() as ldf:
df = ldf.collect()
if "__index_level_0__" in df.columns:
df = df.drop("__index_level_0__")
return df
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid file type: {type(invalid_pseudo_data)}")

Expand All @@ -146,6 +152,14 @@ def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
): # Apply original schema if available
pandas_df = pandas_df.astype(self._schema)

return pandas_df
case pl.LazyFrame() as ldf:
pandas_df = ldf.collect().to_pandas()
if isinstance(
self._schema, pd.Series
): # Apply original schema if available
pandas_df = pandas_df.astype(self._schema)

return pandas_df
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")
Expand All @@ -169,26 +183,24 @@ def to_file(self, file_path: str, **kwargs: Any) -> None:
datadoc_file_path: Path | GSPath
if file_path.startswith(GSPath.cloud_prefix):
client = GSClient()
gs_path = GSPath(cloud_path=file_path, client=client)
cloud_path = GSPath(cloud_path=file_path, client=client)

file_handle = gs_path.open(mode="wb")
Copy link
Contributor Author

@mallport mallport Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After we upgraded Polars, we no longer have to use file handles for writing to GCS; we can simply supply the "gs://" file path, and Polars automatically infers the authentication from the environment.


datadoc_file_path = gs_path.parent.joinpath(Path(datadoc_file_name))
datadoc_file_path = cloud_path.parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")
else:
file_handle = Path(file_path).open(mode="wb")

datadoc_file_path = Path(file_path).parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")

match self._pseudo_data:
case pl.DataFrame() as df:
write_from_df(df, file_format, file_handle, **kwargs)
write_from_df(df, file_format, file_path, **kwargs)
datadoc_file_handle.write(self.datadoc)
case pl.LazyFrame() as ldf:
write_from_lazy_df(ldf, file_format, file_path, **kwargs)
datadoc_file_handle.write(self.datadoc)
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")

file_handle.close()
datadoc_file_handle.close()

@property
Expand Down
Loading