Merge pull request #56 from transformerlab/add/save-dataset

deep1401 · web-flow · commit ea0260e786ae · 2025-10-30T15:20:31.000-06:00
Add lab.save_dataset() functionality
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "transformerlab"
-version = "0.0.39"
+version = "0.0.40"
 description = "Python SDK for Transformer Lab"
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/src/lab/lab_facade.py b/src/lab/lab_facade.py
@@ -9,6 +9,7 @@
 from .job import Job
 from . import dirs
 from .model import Model as ModelService
+from .dataset import Dataset
 
 class Lab:
     """
@@ -151,6 +152,98 @@ def save_artifact(self, source_path: str, name: Optional[str] = None) -> str:
 
         return dest
 
+    def save_dataset(self, df, dataset_id: str, additional_metadata: Optional[Dict[str, Any]] = None, suffix: Optional[str] = None, is_image: bool = False) -> str:
+        """
+        Save a dataset under the workspace datasets directory and mark it as generated.
+
+        Args:
+            df: A pandas DataFrame or a Hugging Face datasets.Dataset to serialize to disk.
+            dataset_id: Identifier for the dataset directory under `datasets/`.
+            additional_metadata: Optional dict to merge into dataset json_data.
+            suffix: Optional suffix to append to the output filename stem.
+            is_image: If True, save JSON Lines (for image metadata-style rows).
+
+        Returns:
+            The path to the saved dataset file on disk.
+        """
+        self._ensure_initialized()
+        if not isinstance(dataset_id, str) or dataset_id.strip() == "":
+            raise ValueError("dataset_id must be a non-empty string")
+
+        # Normalize input: convert Hugging Face datasets.Dataset to pandas DataFrame
+        try:
+            if hasattr(df, "to_pandas") and callable(getattr(df, "to_pandas")):
+                df = df.to_pandas()
+        except Exception:
+            pass
+
+        # Prepare dataset directory
+        dataset_id_safe = dataset_id.strip()
+        dataset_dir = dirs.dataset_dir_by_id(dataset_id_safe)
+        # If exists, then raise an error
+        if os.path.exists(dataset_dir):
+            raise FileExistsError(f"Dataset with ID {dataset_id_safe} already exists")
+        os.makedirs(dataset_dir, exist_ok=True)
+
+        # Determine output filename
+        if is_image:
+            lines = True
+            output_filename = "metadata.jsonl"
+        else:
+            lines = False
+            stem = dataset_id_safe
+            if isinstance(suffix, str) and suffix.strip() != "":
+                stem = f"{stem}_{suffix.strip()}"
+            output_filename = f"{stem}.json"
+
+        output_path = os.path.join(dataset_dir, output_filename)
+
+        # Persist dataframe
+        try:
+            if not hasattr(df, "to_json"):
+                raise TypeError("df must be a pandas DataFrame or a Hugging Face datasets.Dataset")
+            df.to_json(output_path, orient="records", lines=lines)
+        except Exception as e:
+            raise RuntimeError(f"Failed to save dataset to {output_path}: {str(e)}")
+
+        # Create or update filesystem metadata so it appears under generated datasets
+        try:
+            try:
+                ds = Dataset.get(dataset_id_safe)
+            except FileNotFoundError:
+                ds = Dataset.create(dataset_id_safe)
+
+            # Base json_data with generated flag for UI filtering
+            json_data: Dict[str, Any] = {
+                "generated": True,
+                "sample_count": len(df) if hasattr(df, "__len__") else -1,
+                "files": [output_filename],
+            }
+            if additional_metadata and isinstance(additional_metadata, dict):
+                json_data.update(additional_metadata)
+
+            ds.set_metadata(
+                location="local",
+                description=json_data.get("description", ""),
+                size=-1,
+                json_data=json_data,
+            )
+        except Exception as e:
+            # Do not fail the save if metadata write fails; log to job data
+            try:
+                self._job.update_job_data_field("dataset_metadata_error", str(e))  # type: ignore[union-attr]
+            except Exception:
+                pass
+
+        # Track dataset on the job for provenance
+        try:
+            self._job.update_job_data_field("dataset_id", dataset_id_safe)  # type: ignore[union-attr]
+        except Exception:
+            pass
+
+        self.log(f"Dataset saved to '{output_path}' and registered as generated dataset '{dataset_id_safe}'")
+        return output_path
+
     def save_checkpoint(self, source_path: str, name: Optional[str] = None) -> str:
         """
         Save a checkpoint file or directory into this job's checkpoints folder.