Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Pseudonymize, repseudonymize and depseudonymize data on Dapla.

## Features

Other examples can also be viewed through notebook files for [pseudo](tests/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb)
Other examples can also be viewed through notebook files for [pseudo](tests/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb).

### Pseudonymize

Expand Down Expand Up @@ -100,6 +100,8 @@ Note that you may also use a Pandas DataFrame as an input or output, by exchangi
and `to_polars` with `to_pandas`. However, Pandas is much less performant, so take special care especially if your
dataset is large.

`from_polars(...)` accepts both `pl.DataFrame` and `pl.LazyFrame`.

Example:

```python
Expand All @@ -113,6 +115,33 @@ df_pandas = (
)
```

### Polars LazyFrame on GCS

You can use `from_polars(...)` with lazy inputs, for example scanning from GCS and writing the pseudonymized output back to GCS.

```python
import os
import polars as pl
from dapla_pseudo import Pseudonymize

bucket = os.environ["BUCKET_NAME"]
input_path = f"gs://{bucket}/pseudo-lazy-demo/input.parquet"
output_path = f"gs://{bucket}/pseudo-lazy-demo/output.parquet"

lazy_df = pl.scan_parquet(input_path)

result = (
Pseudonymize.from_polars(lazy_df)
.on_fields("person_id")
.with_default_encryption()
.run()
)

# Writes both the pseudonymized data and the accompanying datadoc metadata file (suffixed __DOC.json)
result.to_file(output_path)
```



### Validate SID mapping

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dapla-toolbelt-pseudo"
version = "6.0.3"
version = "6.1.0"
description = "Pseudonymization extensions for Dapla"
authors = [{ name = "Dapla Developers", email = "dapla-platform-developers@ssb.no" }]
requires-python = ">=3.11,<4.0"
Expand Down
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/baseclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class _BasePseudonymizer:
def __init__(
self,
pseudo_operation: PseudoOperation,
dataset: pl.DataFrame,
dataset: pl.DataFrame | pl.LazyFrame,
hierarchical: bool,
user_provided_metadata: Datadoc | None,
) -> None:
Expand Down
20 changes: 17 additions & 3 deletions src/dapla_pseudo/v1/depseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Depseudonymize:
This class should not be instantiated, only the static methods should be used.
"""

dataset: pl.DataFrame
dataset: pl.DataFrame | pl.LazyFrame
schema: pd.Series | pl.Schema

@staticmethod
Expand All @@ -36,10 +36,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Depseudonymize._Depseudonymizer":
return Depseudonymize._Depseudonymizer()

@staticmethod
def from_polars(dataframe: pl.DataFrame) -> "Depseudonymize._Depseudonymizer":
def from_polars(
dataframe: pl.DataFrame | pl.LazyFrame,
) -> "Depseudonymize._Depseudonymizer":
"""Initialize a depseudonymization request from a polars DataFrame."""
Depseudonymize.dataset = dataframe
Depseudonymize.schema = dataframe.schema
Depseudonymize.schema = (
dataframe.schema
if type(dataframe) is pl.DataFrame
else dataframe.collect_schema()
)
return Depseudonymize._Depseudonymizer()

class _Depseudonymizer(_BasePseudonymizer):
Expand Down Expand Up @@ -86,7 +92,15 @@ def run(

Returns:
Result: The depseudonymized dataset and the associated metadata.

Raises:
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
"""
if hierarchical and isinstance(Depseudonymize.dataset, pl.LazyFrame):
raise ValueError(
"Hierarchical datasets are not supported for Polars LazyFrames."
)

super().__init__(
pseudo_operation=PseudoOperation.DEPSEUDONYMIZE,
dataset=Depseudonymize.dataset,
Expand Down
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/models/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,5 @@ def __add__(self, other: "RawPseudoMetadata | None") -> "RawPseudoMetadata":
class PseudoFieldResponse:
"""PseudoFieldResponse holds the data and metadata from a Pseudo Service field response."""

data: pl.DataFrame
data: pl.DataFrame | pl.LazyFrame
raw_metadata: list[RawPseudoMetadata]
37 changes: 29 additions & 8 deletions src/dapla_pseudo/v1/mutable_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,26 +57,43 @@ def get_value(self) -> list[str | int | None]:
class MutableDataFrame:
"""A DataFrame that can change values in-place."""

def __init__(self, dataframe: pl.DataFrame, hierarchical: bool) -> None:
def __init__(
self, dataframe: pl.DataFrame | pl.LazyFrame, hierarchical: bool
) -> None:
"""Initialize the class."""
self.dataset: pl.DataFrame | dict[str, Any] = dataframe
self.dataset: pl.DataFrame | dict[str, Any] | pl.LazyFrame = dataframe
self.matched_fields: dict[str, FieldMatch] = {}
self.matched_fields_metrics: dict[str, int] | None = None
self.hierarchical: bool = hierarchical
self.schema = dataframe.schema
self.schema = (
dataframe.schema
if isinstance(dataframe, pl.DataFrame)
else dataframe.collect_schema()
)

def match_rules(
self, rules: list[PseudoRule], target_rules: list[PseudoRule] | None
) -> None:
"""Create references to all the columns that matches the given pseudo rules."""
if self.hierarchical is False:
assert isinstance(self.dataset, pl.DataFrame)
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
self.dataset, pl.LazyFrame
)

def extract_column_data(
pattern: str, dataset: pl.DataFrame | pl.LazyFrame
) -> list[Any]:
if isinstance(dataset, pl.DataFrame):
return list(dataset.get_column(pattern))
elif isinstance(dataset, pl.LazyFrame):
return list(dataset.select(pattern).collect().to_series())

self.matched_fields = {
str(i): FieldMatch(
path=rule.pattern,
pattern=rule.pattern,
indexer=[],
col=list(self.dataset.get_column(rule.pattern)),
col=extract_column_data(rule.pattern, self.dataset),
wrapped_list=False,
func=rule.func,
target_func=target_rule.func if target_rule else None,
Expand Down Expand Up @@ -109,7 +126,9 @@ def get_matched_fields(self) -> dict[str, FieldMatch]:
def update(self, path: str, data: list[str | None]) -> None:
"""Update a column with the given data."""
if self.hierarchical is False:
assert isinstance(self.dataset, pl.DataFrame)
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
self.dataset, pl.LazyFrame
)
self.dataset = self.dataset.with_columns(pl.Series(data).alias(path))
elif (field_match := self.matched_fields.get(path)) is not None:
assert isinstance(self.dataset, dict)
Expand All @@ -122,10 +141,12 @@ def update(self, path: str, data: list[str | None]) -> None:
data if field_match.wrapped_list is False else data[0]
)

def to_polars(self) -> pl.DataFrame:
def to_polars(self) -> pl.DataFrame | pl.LazyFrame:
"""Convert to Polars DataFrame."""
if self.hierarchical is False:
assert isinstance(self.dataset, pl.DataFrame)
assert isinstance(self.dataset, pl.DataFrame) or isinstance(
self.dataset, pl.LazyFrame
)
return self.dataset
else:
assert isinstance(self.dataset, dict)
Expand Down
20 changes: 17 additions & 3 deletions src/dapla_pseudo/v1/pseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Pseudonymize:
This class should not be instantiated, only the static methods should be used.
"""

dataset: pl.DataFrame
dataset: pl.DataFrame | pl.LazyFrame
schema: pd.Series | pl.Schema

@staticmethod
Expand All @@ -43,7 +43,9 @@ def from_pandas(dataframe: pd.DataFrame) -> "Pseudonymize._Pseudonymizer":
return Pseudonymize._Pseudonymizer()

@staticmethod
def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
def from_polars(
dataframe: pl.DataFrame | pl.LazyFrame,
) -> "Pseudonymize._Pseudonymizer":
"""Initialize a pseudonymization request from a Polars DataFrame.

Args:
Expand All @@ -53,7 +55,11 @@ def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
_Pseudonymizer: An instance of the _Pseudonymizer class.
"""
Pseudonymize.dataset = dataframe
Pseudonymize.schema = dataframe.schema
Pseudonymize.schema = (
dataframe.schema
if type(dataframe) is pl.DataFrame
else dataframe.collect_schema()
)
return Pseudonymize._Pseudonymizer()

class _Pseudonymizer(_BasePseudonymizer):
Expand Down Expand Up @@ -107,7 +113,15 @@ def run(

Returns:
Result: The pseudonymized dataset and the associated metadata.

Raises:
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
"""
if hierarchical and type(Pseudonymize.dataset) is pl.LazyFrame:
raise ValueError(
"Hierarchical datasets are not supported for Polars LazyFrames."
)

super().__init__(
pseudo_operation=PseudoOperation.PSEUDONYMIZE,
dataset=Pseudonymize.dataset,
Expand Down
20 changes: 17 additions & 3 deletions src/dapla_pseudo/v1/repseudo.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Repseudonymize:
This class should not be instantiated, only the static methods should be used.
"""

dataset: pl.DataFrame
dataset: pl.DataFrame | pl.LazyFrame
schema: pd.Series | pl.Schema

@staticmethod
Expand All @@ -35,10 +35,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Repseudonymize._Repseudonymizer":
return Repseudonymize._Repseudonymizer()

@staticmethod
def from_polars(dataframe: pl.DataFrame) -> "Repseudonymize._Repseudonymizer":
def from_polars(
dataframe: pl.DataFrame | pl.LazyFrame,
) -> "Repseudonymize._Repseudonymizer":
"""Initialize a pseudonymization request from a polars DataFrame."""
Repseudonymize.dataset = dataframe
Repseudonymize.schema = dataframe.schema
Repseudonymize.schema = (
dataframe.schema
if type(dataframe) is pl.DataFrame
else dataframe.collect_schema()
)
return Repseudonymize._Repseudonymizer()

class _Repseudonymizer(_BasePseudonymizer):
Expand Down Expand Up @@ -99,7 +105,15 @@ def run(

Returns:
Result: The pseudonymized dataset and the associated metadata.

Raises:
ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
"""
if hierarchical and isinstance(Repseudonymize.dataset, pl.LazyFrame):
raise ValueError(
"Hierarchical datasets are not supported for Polars LazyFrames."
)

super().__init__(
pseudo_operation=PseudoOperation.REPSEUDONYMIZE,
dataset=Repseudonymize.dataset,
Expand Down
30 changes: 21 additions & 9 deletions src/dapla_pseudo/v1/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dapla_pseudo.utils import get_file_format_from_file_name
from dapla_pseudo.v1.models.api import PseudoFieldResponse
from dapla_pseudo.v1.supported_file_format import write_from_df
from dapla_pseudo.v1.supported_file_format import write_from_lazy_df


class Result:
Expand All @@ -32,7 +33,7 @@ def __init__(
schema: pd.Series | pl.Schema | None = None,
) -> None:
"""Initialise a PseudonymizationResult."""
self._pseudo_data: pl.DataFrame = pseudo_response.data
self._pseudo_data: pl.DataFrame | pl.LazyFrame = pseudo_response.data
self._metadata: dict[str, dict[str, list[Any]]] = {}
self._datadoc: Datadoc | list[Variable]
self._schema = schema
Expand Down Expand Up @@ -122,6 +123,11 @@ def to_polars(self, **kwargs: Any) -> pl.DataFrame:
if "__index_level_0__" in df.columns:
df = df.drop("__index_level_0__")
return df
case pl.LazyFrame() as ldf:
df = ldf.collect()
if "__index_level_0__" in df.columns:
df = df.drop("__index_level_0__")
return df
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid file type: {type(invalid_pseudo_data)}")

Expand All @@ -146,6 +152,14 @@ def to_pandas(self, **kwargs: Any) -> pd.DataFrame:
): # Apply original schema if available
pandas_df = pandas_df.astype(self._schema)

return pandas_df
case pl.LazyFrame() as ldf:
pandas_df = ldf.collect().to_pandas()
if isinstance(
self._schema, pd.Series
): # Apply original schema if available
pandas_df = pandas_df.astype(self._schema)

return pandas_df
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")
Expand All @@ -169,26 +183,24 @@ def to_file(self, file_path: str, **kwargs: Any) -> None:
datadoc_file_path: Path | GSPath
if file_path.startswith(GSPath.cloud_prefix):
client = GSClient()
gs_path = GSPath(cloud_path=file_path, client=client)
cloud_path = GSPath(cloud_path=file_path, client=client)

file_handle = gs_path.open(mode="wb")
Copy link
Contributor Author

@mallport mallport Feb 24, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After we upgraded Polars, we no longer have to use file handles for writing to GCS; we can simply supply the "gs://" file path, and Polars automatically infers the authentication from the environment.


datadoc_file_path = gs_path.parent.joinpath(Path(datadoc_file_name))
datadoc_file_path = cloud_path.parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")
else:
file_handle = Path(file_path).open(mode="wb")

datadoc_file_path = Path(file_path).parent.joinpath(Path(datadoc_file_name))
datadoc_file_handle = datadoc_file_path.open(mode="w")

match self._pseudo_data:
case pl.DataFrame() as df:
write_from_df(df, file_format, file_handle, **kwargs)
write_from_df(df, file_format, file_path, **kwargs)
datadoc_file_handle.write(self.datadoc)
case pl.LazyFrame() as ldf:
write_from_lazy_df(ldf, file_format, file_path, **kwargs)
datadoc_file_handle.write(self.datadoc)
case _ as invalid_pseudo_data:
raise ValueError(f"Invalid response type: {type(invalid_pseudo_data)}")

file_handle.close()
datadoc_file_handle.close()

@property
Expand Down
Loading