Skip to content

Commit 478358c

Browse files
fix: seed datasets replace existing columns when names collide (#158)
* fix: seed datasets replace existing columns when names collide * use !r Co-authored-by: Nabin Mulepati <[email protected]> --------- Co-authored-by: Nabin Mulepati <[email protected]>
1 parent a7d25dc commit 478358c

File tree

3 files changed

+118
-10
lines changed

3 files changed

+118
-10
lines changed

src/data_designer/config/config_builder.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ def add_column(
224224
225225
Returns:
226226
The current Data Designer config builder instance.
227+
228+
Raises:
229+
BuilderConfigurationError: If the column name collides with an existing seed dataset column.
227230
"""
228231
if column_config is None:
229232
if name is None or column_type is None:
@@ -240,6 +243,13 @@ def add_column(
240243
f"{', '.join([t.__name__ for t in allowed_column_configs])}"
241244
)
242245

246+
existing_config = self._column_configs.get(column_config.name)
247+
if existing_config is not None and isinstance(existing_config, SeedDatasetColumnConfig):
248+
raise BuilderConfigurationError(
249+
f"🛑 Column {column_config.name!r} already exists as a seed dataset column. "
250+
"Please use a different column name or update the seed dataset."
251+
)
252+
243253
self._column_configs[column_config.name] = column_config
244254
return self
245255

@@ -578,7 +588,18 @@ def with_seed_dataset(
578588
579589
Returns:
580590
The current Data Designer config builder instance.
591+
592+
Raises:
593+
BuilderConfigurationError: If any seed dataset column name collides with an existing column.
581594
"""
595+
seed_column_names = fetch_seed_dataset_column_names(dataset_reference)
596+
colliding_columns = [name for name in seed_column_names if name in self._column_configs]
597+
if colliding_columns:
598+
raise BuilderConfigurationError(
599+
f"🛑 Seed dataset column(s) {colliding_columns} collide with existing column(s). "
600+
"Please remove the conflicting columns or use a seed dataset with different column names."
601+
)
602+
582603
self._seed_config = SeedConfig(
583604
dataset=dataset_reference.dataset,
584605
sampling_strategy=sampling_strategy,
@@ -587,7 +608,7 @@ def with_seed_dataset(
587608
self.set_seed_datastore_settings(
588609
dataset_reference.datastore_settings if hasattr(dataset_reference, "datastore_settings") else None
589610
)
590-
for column_name in fetch_seed_dataset_column_names(dataset_reference):
611+
for column_name in seed_column_names:
591612
self._column_configs[column_name] = SeedDatasetColumnConfig(name=column_name)
592613
return self
593614

tests/config/test_config_builder.py

Lines changed: 95 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,14 @@ class DummyStructuredModel(BaseModel):
4242
@pytest.fixture
4343
def mock_fetch_seed_dataset_column_names():
4444
with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch_seed:
45-
mock_fetch_seed.return_value = ["id", "name", "age", "city"]
45+
mock_fetch_seed.return_value = ["id", "name", "city", "country"]
4646
yield mock_fetch_seed
4747

4848

4949
@pytest.fixture
5050
def stub_data_designer_builder(stub_data_designer_builder_config_str):
5151
with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch_seed:
52-
mock_fetch_seed.return_value = ["id", "name", "age", "city"]
52+
mock_fetch_seed.return_value = ["id", "name", "city", "country"]
5353
yield DataDesignerConfigBuilder.from_config(config=stub_data_designer_builder_config_str)
5454

5555

@@ -404,25 +404,25 @@ def test_delete_constraints(stub_data_designer_builder):
404404

405405

406406
def test_delete_column(stub_data_designer_builder):
407-
assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 3
407+
assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 4
408408
stub_data_designer_builder.delete_column(column_name="code_id")
409-
assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 2
409+
assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 3
410410

411411
with pytest.raises(
412412
BuilderConfigurationError, match="Seed columns cannot be deleted. Please update the seed dataset instead."
413413
):
414-
stub_data_designer_builder.delete_column(column_name="age")
414+
stub_data_designer_builder.delete_column(column_name="id")
415415

416416

417417
def test_getters(stub_data_designer_builder):
418-
assert len(stub_data_designer_builder.get_column_configs()) == 11
418+
assert len(stub_data_designer_builder.get_column_configs()) == 12
419419
assert stub_data_designer_builder.get_column_config(name="code_id").name == "code_id"
420420
assert len(stub_data_designer_builder.get_constraints(target_column="age")) == 1
421421
assert len(stub_data_designer_builder.get_llm_gen_columns()) == 3
422-
assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 3
422+
assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 4
423423
assert len(stub_data_designer_builder.get_columns_excluding_type(DataDesignerColumnType.SAMPLER)) == 8
424424
assert stub_data_designer_builder.get_seed_config().dataset == "test-repo/testing/data.csv"
425-
assert stub_data_designer_builder.num_columns_of_type(DataDesignerColumnType.SAMPLER) == 3
425+
assert stub_data_designer_builder.num_columns_of_type(DataDesignerColumnType.SAMPLER) == 4
426426

427427

428428
def test_write_config(stub_data_designer_builder):
@@ -759,3 +759,90 @@ def test_delete_model_config(stub_empty_builder):
759759

760760
assert result is stub_empty_builder
761761
assert len(stub_empty_builder.model_configs) == 2
762+
763+
764+
def test_add_column_collision_with_seed_dataset(stub_empty_builder: DataDesignerConfigBuilder) -> None:
765+
"""Test that adding a column that collides with a seed dataset column raises an error."""
766+
datastore_settings = DatastoreSettings(endpoint="https://huggingface.co", token="test-token")
767+
768+
with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch:
769+
mock_fetch.return_value = ["id", "name", "age"]
770+
stub_empty_builder.with_seed_dataset(
771+
DatastoreSeedDatasetReference(dataset="test-repo/test-data.parquet", datastore_settings=datastore_settings)
772+
)
773+
774+
with pytest.raises(
775+
BuilderConfigurationError,
776+
match="Column 'id' already exists as a seed dataset column",
777+
):
778+
stub_empty_builder.add_column(
779+
name="id",
780+
column_type=DataDesignerColumnType.SAMPLER,
781+
sampler_type=SamplerType.UUID,
782+
)
783+
784+
with pytest.raises(
785+
BuilderConfigurationError,
786+
match="Column 'name' already exists as a seed dataset column",
787+
):
788+
stub_empty_builder.add_column(
789+
LLMTextColumnConfig(
790+
name="name",
791+
prompt="Write a name",
792+
model_alias="stub-model",
793+
)
794+
)
795+
796+
797+
def test_with_seed_dataset_collision_with_existing_columns(stub_empty_builder: DataDesignerConfigBuilder) -> None:
798+
"""Test that adding a seed dataset with columns that collide with existing columns raises an error."""
799+
stub_empty_builder.add_column(
800+
name="name",
801+
column_type=DataDesignerColumnType.LLM_TEXT,
802+
prompt="Write a name",
803+
model_alias="stub-model",
804+
)
805+
stub_empty_builder.add_column(
806+
name="age",
807+
column_type=DataDesignerColumnType.SAMPLER,
808+
sampler_type=SamplerType.UNIFORM,
809+
params={"low": 1, "high": 100},
810+
)
811+
812+
datastore_settings = DatastoreSettings(endpoint="https://huggingface.co", token="test-token")
813+
814+
with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch:
815+
mock_fetch.return_value = ["id", "name", "age", "city"]
816+
with pytest.raises(
817+
BuilderConfigurationError,
818+
match=r"Seed dataset column\(s\) \['name', 'age'\] collide with existing column\(s\)",
819+
):
820+
stub_empty_builder.with_seed_dataset(
821+
DatastoreSeedDatasetReference(
822+
dataset="test-repo/test-data.parquet", datastore_settings=datastore_settings
823+
)
824+
)
825+
826+
assert stub_empty_builder.get_seed_config() is None
827+
assert len(stub_empty_builder.get_columns_of_type(DataDesignerColumnType.SEED_DATASET)) == 0
828+
829+
830+
def test_with_seed_dataset_no_collision(stub_empty_builder: DataDesignerConfigBuilder) -> None:
831+
"""Test that adding a seed dataset with non-colliding columns works fine."""
832+
stub_empty_builder.add_column(
833+
name="unique_column",
834+
column_type=DataDesignerColumnType.SAMPLER,
835+
sampler_type=SamplerType.UUID,
836+
)
837+
838+
datastore_settings = DatastoreSettings(endpoint="https://huggingface.co", token="test-token")
839+
840+
with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch:
841+
mock_fetch.return_value = ["id", "name", "age"]
842+
stub_empty_builder.with_seed_dataset(
843+
DatastoreSeedDatasetReference(dataset="test-repo/test-data.parquet", datastore_settings=datastore_settings)
844+
)
845+
846+
assert stub_empty_builder.get_seed_config() is not None
847+
assert len(stub_empty_builder.get_columns_of_type(DataDesignerColumnType.SEED_DATASET)) == 3
848+
assert len(stub_empty_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 1

tests/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ def stub_empty_builder(stub_model_configs: list[ModelConfig]) -> DataDesignerCon
164164
@pytest.fixture
165165
def stub_complete_builder(stub_data_designer_builder_config_str: str) -> DataDesignerConfigBuilder:
166166
with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch:
167-
mock_fetch.return_value = ["id", "name", "age", "city"]
167+
mock_fetch.return_value = ["id", "name", "city", "country"]
168168
return DataDesignerConfigBuilder.from_config(config=stub_data_designer_builder_config_str)
169169

170170

0 commit comments

Comments
 (0)