Commit 690b00f

fix check during column name fetch
1 parent 4e6281a commit 690b00f


3 files changed: +59 -37 lines changed


src/data_designer/config/datastore.py

Lines changed: 18 additions & 4 deletions
@@ -32,7 +32,15 @@ class DatastoreSettings(BaseModel):
 
 
 def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[str]:
-    """Extract column names based on file type."""
+    """Extract column names based on file type. Supports glob patterns like '../path/*.parquet'."""
+    file_path = Path(file_path)
+    if "*" in str(file_path):
+        matching_files = sorted(file_path.parent.glob(file_path.name))
+        if not matching_files:
+            raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
+        logger.info(f"0️⃣ Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
+        file_path = matching_files[0]
+
     if file_type == "parquet":
         try:
             schema = pq.read_schema(file_path)
@@ -123,11 +131,17 @@ def _fetch_seed_dataset_column_names_from_datastore(
 
 
 def _fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
-    dataset_path = _validate_dataset_path(dataset_path)
-    return get_file_column_names(dataset_path, dataset_path.suffix.lower()[1:])
+    dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
+    return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
 
 
-def _validate_dataset_path(dataset_path: Union[str, Path]) -> Path:
+def _validate_dataset_path(dataset_path: Union[str, Path], allow_glob_pattern: bool = False) -> Path:
+    if allow_glob_pattern and "*" in str(dataset_path):
+        valid_wild_card_versions = {f"*{ext}" for ext in VALID_DATASET_FILE_EXTENSIONS}
+        if not any(dataset_path.endswith(wildcard) for wildcard in valid_wild_card_versions):
+            file_extension = dataset_path.split("*.")[-1]
+            raise InvalidFilePathError(f"🛑 Path {dataset_path!r} does not contain files of type {file_extension!r}.")
+        return Path(dataset_path)
     if not Path(dataset_path).is_file():
         raise InvalidFilePathError("🛑 To upload a dataset to the datastore, you must provide a valid file path.")
     if not Path(dataset_path).name.endswith(tuple(VALID_DATASET_FILE_EXTENSIONS)):
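
For orientation, a minimal usage sketch of the new glob support in get_file_column_names, assuming the module is importable as data_designer.config.datastore (inferred from the file path above) and using a throwaway seed_data/ directory created only for this example:

    from pathlib import Path

    import pandas as pd

    # Import path is an assumption based on src/data_designer/config/datastore.py.
    from data_designer.config.datastore import get_file_column_names

    # Hypothetical scratch directory with a few parquet partitions.
    seed_dir = Path("seed_data")
    seed_dir.mkdir(exist_ok=True)
    for i in range(3):
        pd.DataFrame({"id": [i], "text": ["example"]}).to_parquet(seed_dir / f"part-{i}.parquet")

    # With this commit, a glob pattern is accepted: matching files are sorted and the
    # first one is used to read the schema and return the column names.
    print(get_file_column_names(f"{seed_dir}/*.parquet", "parquet"))  # ['id', 'text']

A pattern that matches no files raises InvalidFilePathError with "No files found matching pattern", which the new tests below exercise.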

tests/config/test_datastore.py

Lines changed: 22 additions & 6 deletions
@@ -36,7 +36,7 @@ def _write_file(df, path, file_type):
 
 @pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
 def test_get_file_column_names_basic_parquet(tmp_path, file_type):
-    """Test _get_file_column_names with basic parquet file."""
+    """Test get_file_column_names with basic parquet file."""
     test_data = {
         "id": [1, 2, 3],
         "name": ["Alice", "Bob", "Charlie"],
@@ -51,7 +51,7 @@ def test_get_file_column_names_basic_parquet(tmp_path, file_type):
 
 
 def test_get_file_column_names_nested_fields(tmp_path):
-    """Test _get_file_column_names with nested fields in parquet."""
+    """Test get_file_column_names with nested fields in parquet."""
     schema = pa.schema(
         [
             pa.field(
@@ -72,7 +72,7 @@ def test_get_file_column_names_nested_fields(tmp_path):
 
 @pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
 def test_get_file_column_names_empty_parquet(tmp_path, file_type):
-    """Test _get_file_column_names with empty parquet file."""
+    """Test get_file_column_names with empty parquet file."""
     empty_df = pd.DataFrame()
     empty_path = tmp_path / f"empty.{file_type}"
     _write_file(empty_df, empty_path, file_type)
@@ -83,7 +83,7 @@ def test_get_file_column_names_empty_parquet(tmp_path, file_type):
 
 @pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
 def test_get_file_column_names_large_schema(tmp_path, file_type):
-    """Test _get_file_column_names with many columns."""
+    """Test get_file_column_names with many columns."""
     num_columns = 50
     test_data = {f"col_{i}": np.random.randn(10) for i in range(num_columns)}
     df = pd.DataFrame(test_data)
@@ -98,7 +98,7 @@ def test_get_file_column_names_large_schema(tmp_path, file_type):
 
 @pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
 def test_get_file_column_names_special_characters(tmp_path, file_type):
-    """Test _get_file_column_names with special characters in column names."""
+    """Test get_file_column_names with special characters in column names."""
     special_data = {
         "column with spaces": [1],
         "column-with-dashes": [2],
@@ -117,7 +117,7 @@ def test_get_file_column_names_special_characters(tmp_path, file_type):
 
 @pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
 def test_get_file_column_names_unicode(tmp_path, file_type):
-    """Test _get_file_column_names with unicode column names."""
+    """Test get_file_column_names with unicode column names."""
     unicode_data = {"café": [1], "résumé": [2], "naïve": [3], "façade": [4], "garçon": [5], "über": [6], "schön": [7]}
     df_unicode = pd.DataFrame(unicode_data)
 
@@ -126,6 +126,22 @@ def test_get_file_column_names_unicode(tmp_path, file_type):
     assert get_file_column_names(str(unicode_path), file_type) == df_unicode.columns.tolist()
 
 
+@pytest.mark.parametrize("file_type", ["parquet", "csv", "json", "jsonl"])
+def test_get_file_column_names_with_glob_pattern(tmp_path, file_type):
+    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+    for i in range(5):
+        _write_file(df, tmp_path / f"{i}.{file_type}", file_type)
+    assert get_file_column_names(f"{tmp_path}/*.{file_type}", file_type) == ["col1", "col2"]
+
+
+def test_get_file_column_names_with_glob_pattern_error(tmp_path):
+    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
+    for i in range(5):
+        _write_file(df, tmp_path / f"{i}.parquet", "parquet")
+    with pytest.raises(InvalidFilePathError, match="No files found matching pattern"):
+        get_file_column_names(f"{tmp_path}/*.csv", "csv")
+
+
 def test_get_file_column_names_error_handling():
     with pytest.raises(InvalidFilePathError, match="🛑 Unsupported file type: 'txt'"):
         get_file_column_names("test.txt", "txt")

tests/config/test_seed.py

Lines changed: 19 additions & 27 deletions
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from pathlib import Path
-import tempfile
 
 import pandas as pd
 import pytest
@@ -77,33 +76,26 @@ def test_partition_block_to_index_range():
     assert index_range.size == 15
 
 
-def test_local_seed_dataset_reference_validation():
+def test_local_seed_dataset_reference_validation(tmp_path: Path):
     with pytest.raises(InvalidFilePathError, match="🛑 Path test/dataset.parquet is not a file."):
         LocalSeedDatasetReference(dataset="test/dataset.parquet")
 
     # Should not raise an error when referencing supported extensions with wildcard pattern.
-    with tempfile.TemporaryDirectory() as temp_dir:
-        create_partitions_in_path(Path(temp_dir), "parquet")
-        create_partitions_in_path(Path(temp_dir), "csv")
-        create_partitions_in_path(Path(temp_dir), "json")
-        create_partitions_in_path(Path(temp_dir), "jsonl")
-
-        test_cases = [
-            (temp_dir, "parquet"),
-            (temp_dir, "csv"),
-            (temp_dir, "json"),
-            (temp_dir, "jsonl"),
-        ]
-
-        try:
-            for temp_dir, extension in test_cases:
-                reference = LocalSeedDatasetReference(dataset=f"{temp_dir}/*.{extension}")
-                assert reference.dataset == f"{temp_dir}/*.{extension}"
-        except Exception as e:
-            pytest.fail(f"Expected no exception, but got {e}")
-
-    # Should raise an error when referencing a path that does not contain files of the specified type.
-    with tempfile.TemporaryDirectory() as temp_dir:
-        create_partitions_in_path(Path(temp_dir), "parquet")
-        with pytest.raises(InvalidFilePathError, match="does not contain files of type 'csv'"):
-            LocalSeedDatasetReference(dataset=f"{temp_dir}/*.csv")
+    create_partitions_in_path(tmp_path, "parquet")
+    create_partitions_in_path(tmp_path, "csv")
+    create_partitions_in_path(tmp_path, "json")
+    create_partitions_in_path(tmp_path, "jsonl")
+
+    test_cases = ["parquet", "csv", "json", "jsonl"]
+    try:
+        for extension in test_cases:
+            reference = LocalSeedDatasetReference(dataset=f"{tmp_path}/*.{extension}")
+            assert reference.dataset == f"{tmp_path}/*.{extension}"
+    except Exception as e:
+        pytest.fail(f"Expected no exception, but got {e}")
+
+
+def test_local_seed_dataset_reference_validation_error(tmp_path: Path):
+    create_partitions_in_path(tmp_path, "parquet")
+    with pytest.raises(InvalidFilePathError, match="does not contain files of type 'csv'"):
+        LocalSeedDatasetReference(dataset=f"{tmp_path}/*.csv")
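
Both tests lean on the create_partitions_in_path helper imported by this test module; its implementation is not part of the diff. A minimal stand-in consistent with how it is called here (a directory plus a file extension) could look like:

    from pathlib import Path

    import pandas as pd

    def create_partitions_in_path(path: Path, extension: str, num_partitions: int = 3) -> None:
        # Hypothetical stand-in: write a few small partition files with the given extension
        # so that a wildcard such as f"{path}/*.{extension}" matches at least one file.
        df = pd.DataFrame({"col1": [1, 2, 3]})
        for i in range(num_partitions):
            target = path / f"partition-{i}.{extension}"
            if extension == "parquet":
                df.to_parquet(target)
            elif extension == "csv":
                df.to_csv(target, index=False)
            else:  # "json" or "jsonl"
                df.to_json(target, orient="records", lines=(extension == "jsonl"))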
