@@ -42,14 +42,14 @@ class DummyStructuredModel(BaseModel):
@pytest.fixture
def mock_fetch_seed_dataset_column_names():
    """Patch the seed-dataset column lookup referenced by ``config_builder``.

    Yields:
        The mock standing in for ``fetch_seed_dataset_column_names``. While the
        fixture is active it returns ``["id", "name", "city", "country"]``.
    """
    with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch_seed:
        mock_fetch_seed.return_value = ["id", "name", "city", "country"]
        yield mock_fetch_seed
4747
4848
@pytest.fixture
def stub_data_designer_builder(stub_data_designer_builder_config_str):
    """Yield a builder created from the stub config string.

    The seed-dataset column lookup is patched to return
    ``["id", "name", "city", "country"]``. The builder is yielded from inside
    the ``with`` block, so the patch stays active for the whole test that
    uses this fixture.
    """
    with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch_seed:
        mock_fetch_seed.return_value = ["id", "name", "city", "country"]
        yield DataDesignerConfigBuilder.from_config(config=stub_data_designer_builder_config_str)
5454
5555
@@ -404,25 +404,25 @@ def test_delete_constraints(stub_data_designer_builder):
404404
405405
def test_delete_column(stub_data_designer_builder):
    """Deleting ``code_id`` drops the sampler count by one; deleting a seed column raises."""
    assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 4
    stub_data_designer_builder.delete_column(column_name="code_id")
    assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 3

    # "id" is one of the column names returned by the patched seed lookup in
    # the stub_data_designer_builder fixture.
    with pytest.raises(
        BuilderConfigurationError, match="Seed columns cannot be deleted. Please update the seed dataset instead."
    ):
        stub_data_designer_builder.delete_column(column_name="id")
415415
416416
def test_getters(stub_data_designer_builder):
    """Check the builder's getter methods against the stub configuration's expected counts."""
    assert len(stub_data_designer_builder.get_column_configs()) == 12
    assert stub_data_designer_builder.get_column_config(name="code_id").name == "code_id"
    assert len(stub_data_designer_builder.get_constraints(target_column="age")) == 1
    assert len(stub_data_designer_builder.get_llm_gen_columns()) == 3
    assert len(stub_data_designer_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 4
    assert len(stub_data_designer_builder.get_columns_excluding_type(DataDesignerColumnType.SAMPLER)) == 8
    assert stub_data_designer_builder.get_seed_config().dataset == "test-repo/testing/data.csv"
    assert stub_data_designer_builder.num_columns_of_type(DataDesignerColumnType.SAMPLER) == 4
426426
427427
428428def test_write_config (stub_data_designer_builder ):
@@ -759,3 +759,90 @@ def test_delete_model_config(stub_empty_builder):
759759
760760 assert result is stub_empty_builder
761761 assert len (stub_empty_builder .model_configs ) == 2
762+
763+
def test_add_column_collision_with_seed_dataset(stub_empty_builder: DataDesignerConfigBuilder) -> None:
    """Test that adding a column that collides with a seed dataset column raises an error."""
    datastore_settings = DatastoreSettings(endpoint="https://huggingface.co", token="test-token")

    with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch:
        mock_fetch.return_value = ["id", "name", "age"]
        stub_empty_builder.with_seed_dataset(
            DatastoreSeedDatasetReference(dataset="test-repo/test-data.parquet", datastore_settings=datastore_settings)
        )

        # The collision checks run under the patch: harmless if the seed column
        # names were captured eagerly, required if they are looked up lazily.

        # Keyword-argument form of add_column.
        with pytest.raises(
            BuilderConfigurationError,
            match="Column 'id' already exists as a seed dataset column",
        ):
            stub_empty_builder.add_column(
                name="id",
                column_type=DataDesignerColumnType.SAMPLER,
                sampler_type=SamplerType.UUID,
            )

        # Config-object form of add_column.
        with pytest.raises(
            BuilderConfigurationError,
            match="Column 'name' already exists as a seed dataset column",
        ):
            stub_empty_builder.add_column(
                LLMTextColumnConfig(
                    name="name",
                    prompt="Write a name",
                    model_alias="stub-model",
                )
            )
796+
def test_with_seed_dataset_collision_with_existing_columns(stub_empty_builder: DataDesignerConfigBuilder) -> None:
    """Test that adding a seed dataset with columns that collide with existing columns raises an error."""
    stub_empty_builder.add_column(
        name="name",
        column_type=DataDesignerColumnType.LLM_TEXT,
        prompt="Write a name",
        model_alias="stub-model",
    )
    stub_empty_builder.add_column(
        name="age",
        column_type=DataDesignerColumnType.SAMPLER,
        sampler_type=SamplerType.UNIFORM,
        params={"low": 1, "high": 100},
    )

    datastore_settings = DatastoreSettings(endpoint="https://huggingface.co", token="test-token")

    with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch:
        # "name" and "age" overlap with the two columns added above.
        mock_fetch.return_value = ["id", "name", "age", "city"]
        with pytest.raises(
            BuilderConfigurationError,
            match=r"Seed dataset column\(s\) \['name', 'age'\] collide with existing column\(s\)",
        ):
            stub_empty_builder.with_seed_dataset(
                DatastoreSeedDatasetReference(
                    dataset="test-repo/test-data.parquet", datastore_settings=datastore_settings
                )
            )

        # The rejected seed dataset must leave the builder without a seed config
        # and without seed-dataset columns.
        assert stub_empty_builder.get_seed_config() is None
        assert len(stub_empty_builder.get_columns_of_type(DataDesignerColumnType.SEED_DATASET)) == 0
828+
829+
def test_with_seed_dataset_no_collision(stub_empty_builder: DataDesignerConfigBuilder) -> None:
    """Test that adding a seed dataset with non-colliding columns works fine."""
    stub_empty_builder.add_column(
        name="unique_column",
        column_type=DataDesignerColumnType.SAMPLER,
        sampler_type=SamplerType.UUID,
    )

    datastore_settings = DatastoreSettings(endpoint="https://huggingface.co", token="test-token")

    with patch("data_designer.config.config_builder.fetch_seed_dataset_column_names") as mock_fetch:
        mock_fetch.return_value = ["id", "name", "age"]
        stub_empty_builder.with_seed_dataset(
            DatastoreSeedDatasetReference(dataset="test-repo/test-data.parquet", datastore_settings=datastore_settings)
        )

        # Expect one seed-dataset column per mocked name, and the sampler column
        # added above still in place.
        assert stub_empty_builder.get_seed_config() is not None
        assert len(stub_empty_builder.get_columns_of_type(DataDesignerColumnType.SEED_DATASET)) == 3
        assert len(stub_empty_builder.get_columns_of_type(DataDesignerColumnType.SAMPLER)) == 1
0 commit comments