Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion src/data_designer/engine/dataset_builders/artifact_storage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from datetime import datetime
from functools import cached_property
import json
import logging
from pathlib import Path
Expand Down Expand Up @@ -36,9 +38,21 @@ class ArtifactStorage(BaseModel):
def artifact_path_exists(self) -> bool:
    """Report whether ``self.artifact_path`` currently exists on disk."""
    artifact_root = self.artifact_path
    return artifact_root.exists()

@cached_property
def resolved_dataset_name(self) -> str:
    """Return the directory name this session's dataset should be stored under.

    ``dataset_name`` is returned unchanged when ``artifact_path / dataset_name``
    does not exist or exists but contains no entries. When that path already
    contains at least one entry, a ``_%m-%d-%Y_%H%M%S`` timestamp suffix (taken
    from ``datetime.now()``) is appended and an info message is logged.

    Because this is a ``cached_property``, the name is computed on first access
    and the same value is returned for the rest of the instance's lifetime.

    Returns:
        The dataset directory name to use beneath ``artifact_path``.
    """
    dataset_path = self.artifact_path / self.dataset_name
    # any() stops at the first entry instead of listing the whole directory
    # just to learn whether it is empty.
    if dataset_path.exists() and any(dataset_path.iterdir()):
        new_dataset_name = f"{self.dataset_name}_{datetime.now().strftime('%m-%d-%Y_%H%M%S')}"
        logger.info(
            f"📂 Dataset path {str(dataset_path)!r} already exists. Dataset from this session"
            f"\n\t\t will be saved to {str(self.artifact_path / new_dataset_name)!r} instead."
        )
        return new_dataset_name
    return self.dataset_name

@property
def base_dataset_path(self) -> Path:
    """Return the directory for this session's dataset.

    The path is ``artifact_path`` joined with ``resolved_dataset_name`` rather
    than the raw ``dataset_name``, so it follows whatever name that property
    resolved to.
    """
    return self.artifact_path / self.resolved_dataset_name

@property
def dropped_columns_dataset_path(self) -> Path:
Expand Down
6 changes: 5 additions & 1 deletion src/data_designer/interface/data_designer.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,11 @@ def create(
configuration (columns, constraints, seed data, etc.).
num_records: Number of records to generate.
dataset_name: Name of the dataset. This name will be used as the dataset
folder name in the artifact path directory.
folder name in the artifact path directory. If a non-empty directory with the
same name already exists, the dataset will be saved to a new directory whose
name has a datetime stamp appended. For example, if the dataset name is
"awesome_dataset" and a non-empty directory with that name already exists, the
dataset will be saved to a directory named "awesome_dataset_01-01-2025_120000".

Returns:
DatasetCreationResults object with methods for loading the generated dataset,
Expand Down
21 changes: 21 additions & 0 deletions tests/engine/dataset_builders/test_artifact_storage.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from datetime import datetime
import json
from unittest.mock import patch

import pandas as pd
from pyarrow import ArrowNotImplementedError
Expand Down Expand Up @@ -213,3 +215,22 @@ def test_artifact_storage_batch_numbering(stub_artifact_storage, batch_number):
path = stub_artifact_storage.create_batch_file_path(batch_number, BatchStage.FINAL_RESULT)
expected_name = f"batch_{batch_number:05d}.parquet"
assert path.name == expected_name


@patch("data_designer.engine.dataset_builders.artifact_storage.datetime")
def test_artifact_storage_resolved_dataset_name(mock_datetime, tmp_path):
    """Check resolved_dataset_name for a missing, an empty, and a non-empty dataset directory."""
    # Freeze the clock so the timestamp suffix is deterministic.
    mock_datetime.now.return_value = datetime(2025, 1, 1, 12, 3, 4)

    # dataset path does not exist yet
    assert ArtifactStorage(artifact_path=tmp_path).resolved_dataset_name == "dataset"

    # dataset path exists but is empty
    af_storage = ArtifactStorage(artifact_path=tmp_path)
    (af_storage.artifact_path / af_storage.dataset_name).mkdir()
    assert af_storage.resolved_dataset_name == "dataset"

    # dataset path exists and is not empty; a fresh instance is created because
    # resolved_dataset_name is a cached property and would otherwise keep the
    # value computed above
    af_storage = ArtifactStorage(artifact_path=tmp_path)
    (af_storage.artifact_path / af_storage.dataset_name / "stub_file.txt").touch()
    assert af_storage.resolved_dataset_name == "dataset_01-01-2025_120304"