statisticsnorway
diff --git a/‎README.md‎
Lines changed: 30 additions & 1 deletion b/‎README.md‎
Lines changed: 30 additions & 1 deletion
diff --git a/‎examples/pseudo_examples.ipynb‎
Lines changed: 25 additions & 3 deletions b/‎examples/pseudo_examples.ipynb‎
Lines changed: 25 additions & 3 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/dapla_pseudo/v1/baseclasses.py‎
Lines changed: 1 addition & 1 deletion b/‎src/dapla_pseudo/v1/baseclasses.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/dapla_pseudo/v1/depseudo.py‎
Lines changed: 17 additions & 3 deletions b/‎src/dapla_pseudo/v1/depseudo.py‎
Lines changed: 17 additions & 3 deletions
diff --git a/‎src/dapla_pseudo/v1/models/api.py‎
Lines changed: 1 addition & 1 deletion b/‎src/dapla_pseudo/v1/models/api.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/dapla_pseudo/v1/mutable_dataframe.py‎
Lines changed: 29 additions & 8 deletions b/‎src/dapla_pseudo/v1/mutable_dataframe.py‎
Lines changed: 29 additions & 8 deletions
diff --git a/‎src/dapla_pseudo/v1/pseudo.py‎
Lines changed: 17 additions & 3 deletions b/‎src/dapla_pseudo/v1/pseudo.py‎
Lines changed: 17 additions & 3 deletions
diff --git a/‎src/dapla_pseudo/v1/repseudo.py‎
Lines changed: 17 additions & 3 deletions b/‎src/dapla_pseudo/v1/repseudo.py‎
Lines changed: 17 additions & 3 deletions
@@ -28,7 +28,7 @@ Pseudonymize, repseudonymize and depseudonymize data on Dapla.
 
 ## Features
 
-Other examples can also be viewed through notebook files for [pseudo](tests/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb)
+Other examples can also be viewed through notebook files for [pseudo](examples/pseudo_examples.ipynb) and [depseudo](tests/depseudo_examples.ipynb).
 
 ### Pseudonymize
 
@@ -100,6 +100,8 @@ Note that you may also use a Pandas DataFrame as an input or output, by exchangi
 and `to_polars` with `to_pandas`. However, Pandas is much less performant, so take special care if your
 dataset is large.
 
+`from_polars(...)` accepts both `pl.DataFrame` and `pl.LazyFrame`.
+
 Example:
 
 ```python
@@ -113,6 +115,33 @@ df_pandas = (
 )
 ```
 
+### Polars LazyFrame on GCS
+
+You can use `from_polars(...)` with lazy inputs, for example scanning from GCS and writing the pseudonymized output back to GCS.
+
+```python
+import os
+import polars as pl
+from dapla_pseudo import Pseudonymize
+
+bucket = os.environ["BUCKET_NAME"]
+input_path = f"gs://{bucket}/pseudo-lazy-demo/input.parquet"
+output_path = f"gs://{bucket}/pseudo-lazy-demo/output.parquet"
+
+lazy_df = pl.scan_parquet(input_path)
+
+result = (
+    Pseudonymize.from_polars(lazy_df)
+    .on_fields("person_id")
+    .with_default_encryption()
+    .run()
+)
+
+# Writes both data and datadoc metadata (__DOC.json)
+result.to_file(output_path)
+```
+
+
 
 ### Validate SID mapping
 
 
@@ -298,18 +298,40 @@
     "result.to_polars().head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "95003fcf",
+   "metadata": {},
+   "source": [
+    "# Case: Stream data from file to file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "e5dfac47967eb179",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "import polars as pl\n",
+    "\n",
+    "lazy_df = pl.scan_parquet(\"input_file.parquet\")\n",
+    "\n",
+    "result = (\n",
+    "    Pseudonymize.from_polars(lazy_df)\n",
+    "    .on_fields(\"person_id\")\n",
+    "    .with_default_encryption()\n",
+    "    .run()\n",
+    ")\n",
+    "\n",
+    "# Writes both data and datadoc metadata (__DOC.json)\n",
+    "result.to_file(\"output_file.parquet\")"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv",
+   "display_name": "dapla-toolbelt-pseudo",
    "language": "python",
    "name": "python3"
   },
@@ -323,7 +345,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.7"
+   "version": "3.13.1"
   }
  },
  "nbformat": 4,
 
@@ -1,6 +1,6 @@
 [project]
 name = "dapla-toolbelt-pseudo"
-version = "6.0.3"
+version = "6.1.0"
 description = "Pseudonymization extensions for Dapla"
 authors = [{ name = "Dapla Developers", email = "dapla-platform-developers@ssb.no" }]
 requires-python = ">=3.11,<4.0"
 
@@ -44,7 +44,7 @@ class _BasePseudonymizer:
     def __init__(
         self,
         pseudo_operation: PseudoOperation,
-        dataset: pl.DataFrame,
+        dataset: pl.DataFrame | pl.LazyFrame,
         hierarchical: bool,
         user_provided_metadata: Datadoc | None,
     ) -> None:
 
@@ -25,7 +25,7 @@ class Depseudonymize:
     This class should not be instantiated, only the static methods should be used.
     """
 
-    dataset: pl.DataFrame
+    dataset: pl.DataFrame | pl.LazyFrame
     schema: pd.Series | pl.Schema
 
     @staticmethod
@@ -36,10 +36,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Depseudonymize._Depseudonymizer":
         return Depseudonymize._Depseudonymizer()
 
     @staticmethod
-    def from_polars(dataframe: pl.DataFrame) -> "Depseudonymize._Depseudonymizer":
+    def from_polars(
+        dataframe: pl.DataFrame | pl.LazyFrame,
+    ) -> "Depseudonymize._Depseudonymizer":
         """Initialize a depseudonymization request from a Polars DataFrame/LazyFrame."""
         Depseudonymize.dataset = dataframe
-        Depseudonymize.schema = dataframe.schema
+        Depseudonymize.schema = (
+            dataframe.schema
+            if type(dataframe) is pl.DataFrame
+            else dataframe.collect_schema()
+        )
         return Depseudonymize._Depseudonymizer()
 
     class _Depseudonymizer(_BasePseudonymizer):
@@ -86,7 +92,15 @@ def run(
 
             Returns:
                 Result: The depseudonymized dataset and the associated metadata.
+
+            Raises:
+                ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
             """
+            if hierarchical and isinstance(Depseudonymize.dataset, pl.LazyFrame):
+                raise ValueError(
+                    "Hierarchical datasets are not supported for Polars LazyFrames."
+                )
+
             super().__init__(
                 pseudo_operation=PseudoOperation.DEPSEUDONYMIZE,
                 dataset=Depseudonymize.dataset,
 
@@ -81,5 +81,5 @@ def __add__(self, other: "RawPseudoMetadata | None") -> "RawPseudoMetadata":
 class PseudoFieldResponse:
     """PseudoFieldResponse holds the data and metadata from a Pseudo Service field response."""
 
-    data: pl.DataFrame
+    data: pl.DataFrame | pl.LazyFrame
     raw_metadata: list[RawPseudoMetadata]
@@ -57,26 +57,43 @@ def get_value(self) -> list[str | int | None]:
 class MutableDataFrame:
     """A DataFrame that can change values in-place."""
 
-    def __init__(self, dataframe: pl.DataFrame, hierarchical: bool) -> None:
+    def __init__(
+        self, dataframe: pl.DataFrame | pl.LazyFrame, hierarchical: bool
+    ) -> None:
         """Initialize the class."""
-        self.dataset: pl.DataFrame | dict[str, Any] = dataframe
+        self.dataset: pl.DataFrame | dict[str, Any] | pl.LazyFrame = dataframe
         self.matched_fields: dict[str, FieldMatch] = {}
         self.matched_fields_metrics: dict[str, int] | None = None
         self.hierarchical: bool = hierarchical
-        self.schema = dataframe.schema
+        self.schema = (
+            dataframe.schema
+            if isinstance(dataframe, pl.DataFrame)
+            else dataframe.collect_schema()
+        )
 
     def match_rules(
         self, rules: list[PseudoRule], target_rules: list[PseudoRule] | None
     ) -> None:
         """Create references to all the columns that matches the given pseudo rules."""
         if self.hierarchical is False:
-            assert isinstance(self.dataset, pl.DataFrame)
+            assert isinstance(self.dataset, pl.DataFrame) or isinstance(
+                self.dataset, pl.LazyFrame
+            )
+
+            def extract_column_data(
+                pattern: str, dataset: pl.DataFrame | pl.LazyFrame
+            ) -> list[Any]:
+                if isinstance(dataset, pl.DataFrame):
+                    return list(dataset.get_column(pattern))
+                elif isinstance(dataset, pl.LazyFrame):
+                    return list(dataset.select(pattern).collect().to_series())
+
             self.matched_fields = {
                 str(i): FieldMatch(
                     path=rule.pattern,
                     pattern=rule.pattern,
                     indexer=[],
-                    col=list(self.dataset.get_column(rule.pattern)),
+                    col=extract_column_data(rule.pattern, self.dataset),
                     wrapped_list=False,
                     func=rule.func,
                     target_func=target_rule.func if target_rule else None,
@@ -109,7 +126,9 @@ def get_matched_fields(self) -> dict[str, FieldMatch]:
     def update(self, path: str, data: list[str | None]) -> None:
         """Update a column with the given data."""
         if self.hierarchical is False:
-            assert isinstance(self.dataset, pl.DataFrame)
+            assert isinstance(self.dataset, pl.DataFrame) or isinstance(
+                self.dataset, pl.LazyFrame
+            )
             self.dataset = self.dataset.with_columns(pl.Series(data).alias(path))
         elif (field_match := self.matched_fields.get(path)) is not None:
             assert isinstance(self.dataset, dict)
@@ -122,10 +141,12 @@ def update(self, path: str, data: list[str | None]) -> None:
                 data if field_match.wrapped_list is False else data[0]
             )
 
-    def to_polars(self) -> pl.DataFrame:
+    def to_polars(self) -> pl.DataFrame | pl.LazyFrame:
         """Convert to a Polars DataFrame (or LazyFrame when the input was lazy)."""
         if self.hierarchical is False:
-            assert isinstance(self.dataset, pl.DataFrame)
+            assert isinstance(self.dataset, pl.DataFrame) or isinstance(
+                self.dataset, pl.LazyFrame
+            )
             return self.dataset
         else:
             assert isinstance(self.dataset, dict)
 
@@ -25,7 +25,7 @@ class Pseudonymize:
     This class should not be instantiated, only the static methods should be used.
     """
 
-    dataset: pl.DataFrame
+    dataset: pl.DataFrame | pl.LazyFrame
     schema: pd.Series | pl.Schema
 
     @staticmethod
@@ -43,7 +43,9 @@ def from_pandas(dataframe: pd.DataFrame) -> "Pseudonymize._Pseudonymizer":
         return Pseudonymize._Pseudonymizer()
 
     @staticmethod
-    def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
+    def from_polars(
+        dataframe: pl.DataFrame | pl.LazyFrame,
+    ) -> "Pseudonymize._Pseudonymizer":
         """Initialize a pseudonymization request from a Polars DataFrame or LazyFrame.
 
         Args:
@@ -53,7 +55,11 @@ def from_polars(dataframe: pl.DataFrame) -> "Pseudonymize._Pseudonymizer":
             _Pseudonymizer: An instance of the _Pseudonymizer class.
         """
         Pseudonymize.dataset = dataframe
-        Pseudonymize.schema = dataframe.schema
+        Pseudonymize.schema = (
+            dataframe.schema
+            if type(dataframe) is pl.DataFrame
+            else dataframe.collect_schema()
+        )
         return Pseudonymize._Pseudonymizer()
 
     class _Pseudonymizer(_BasePseudonymizer):
@@ -107,7 +113,15 @@ def run(
 
             Returns:
                 Result: The pseudonymized dataset and the associated metadata.
+
+            Raises:
+                ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
             """
+            if hierarchical and type(Pseudonymize.dataset) is pl.LazyFrame:
+                raise ValueError(
+                    "Hierarchical datasets are not supported for Polars LazyFrames."
+                )
+
             super().__init__(
                 pseudo_operation=PseudoOperation.PSEUDONYMIZE,
                 dataset=Pseudonymize.dataset,
 
@@ -24,7 +24,7 @@ class Repseudonymize:
     This class should not be instantiated, only the static methods should be used.
     """
 
-    dataset: pl.DataFrame
+    dataset: pl.DataFrame | pl.LazyFrame
     schema: pd.Series | pl.Schema
 
     @staticmethod
@@ -35,10 +35,16 @@ def from_pandas(dataframe: pd.DataFrame) -> "Repseudonymize._Repseudonymizer":
         return Repseudonymize._Repseudonymizer()
 
     @staticmethod
-    def from_polars(dataframe: pl.DataFrame) -> "Repseudonymize._Repseudonymizer":
+    def from_polars(
+        dataframe: pl.DataFrame | pl.LazyFrame,
+    ) -> "Repseudonymize._Repseudonymizer":
         """Initialize a repseudonymization request from a Polars DataFrame/LazyFrame."""
         Repseudonymize.dataset = dataframe
-        Repseudonymize.schema = dataframe.schema
+        Repseudonymize.schema = (
+            dataframe.schema
+            if type(dataframe) is pl.DataFrame
+            else dataframe.collect_schema()
+        )
         return Repseudonymize._Repseudonymizer()
 
     class _Repseudonymizer(_BasePseudonymizer):
@@ -99,7 +105,15 @@ def run(
 
             Returns:
                 Result: The repseudonymized dataset and the associated metadata.
+
+            Raises:
+                ValueError: If hierarchical is True and input dataset is a Polars LazyFrame.
             """
+            if hierarchical and isinstance(Repseudonymize.dataset, pl.LazyFrame):
+                raise ValueError(
+                    "Hierarchical datasets are not supported for Polars LazyFrames."
+                )
+
             super().__init__(
                 pseudo_operation=PseudoOperation.REPSEUDONYMIZE,
                 dataset=Repseudonymize.dataset,