
Commit 2e9e4ff

fix: update get_file_column_names to take a file reference (#68)
* make get_file_column_names take explicit file reference
* add skip_instance_cache back
* add hf filesystem logic
* update docstring
1 parent 060773c commit 2e9e4ff

File tree

2 files changed: +98, -52 lines changed


src/data_designer/config/datastore.py

Lines changed: 70 additions & 34 deletions
@@ -31,47 +31,74 @@ class DatastoreSettings(BaseModel):
     token: Optional[str] = Field(default=None, description="If needed, token to use for authentication.")
 
 
-def get_file_column_names(file_path: Union[str, Path], file_type: str) -> list[str]:
-    """Extract column names based on file type. Supports glob patterns like '../path/*.parquet'."""
-    file_path = Path(file_path)
-    if "*" in str(file_path):
-        matching_files = sorted(file_path.parent.glob(file_path.name))
-        if not matching_files:
-            raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
-        logger.debug(f"0️⃣ Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
-        file_path = matching_files[0]
+def get_file_column_names(file_reference: Union[str, Path, HfFileSystem], file_type: str) -> list[str]:
+    """Get column names from a dataset file.
+
+    Args:
+        file_reference: Path to the dataset file, or an HfFileSystem object.
+        file_type: Type of the dataset file. Must be one of: 'parquet', 'json', 'jsonl', 'csv'.
 
+    Raises:
+        InvalidFilePathError: If the file type is not supported.
+
+    Returns:
+        List of column names.
+    """
     if file_type == "parquet":
         try:
-            schema = pq.read_schema(file_path)
+            schema = pq.read_schema(file_reference)
             if hasattr(schema, "names"):
                 return schema.names
             else:
                 return [field.name for field in schema]
         except Exception as e:
-            logger.warning(f"Failed to process parquet file {file_path}: {e}")
+            logger.warning(f"Failed to process parquet file {file_reference}: {e}")
             return []
     elif file_type in ["json", "jsonl"]:
-        return pd.read_json(file_path, orient="records", lines=True, nrows=1).columns.tolist()
+        return pd.read_json(file_reference, orient="records", lines=True, nrows=1).columns.tolist()
     elif file_type == "csv":
         try:
-            df = pd.read_csv(file_path, nrows=1)
+            df = pd.read_csv(file_reference, nrows=1)
             return df.columns.tolist()
         except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
-            logger.warning(f"Failed to process CSV file {file_path}: {e}")
+            logger.warning(f"Failed to process CSV file {file_reference}: {e}")
             return []
     else:
         raise InvalidFilePathError(f"🛑 Unsupported file type: {file_type!r}")
 
 
 def fetch_seed_dataset_column_names(seed_dataset_reference: SeedDatasetReference) -> list[str]:
     if hasattr(seed_dataset_reference, "datastore_settings"):
-        return _fetch_seed_dataset_column_names_from_datastore(
+        return fetch_seed_dataset_column_names_from_datastore(
             seed_dataset_reference.repo_id,
             seed_dataset_reference.filename,
             seed_dataset_reference.datastore_settings,
         )
-    return _fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
+    return fetch_seed_dataset_column_names_from_local_file(seed_dataset_reference.dataset)
+
+
+def fetch_seed_dataset_column_names_from_datastore(
+    repo_id: str,
+    filename: str,
+    datastore_settings: Optional[Union[DatastoreSettings, dict]] = None,
+) -> list[str]:
+    file_type = filename.split(".")[-1]
+    if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
+        raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
+
+    datastore_settings = resolve_datastore_settings(datastore_settings)
+    fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
+
+    file_path = _extract_single_file_path_from_glob_pattern_if_present(f"datasets/{repo_id}/{filename}", fs=fs)
+
+    with fs.open(file_path) as f:
+        return get_file_column_names(f, file_type)
+
+
+def fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
+    dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
+    dataset_path = _extract_single_file_path_from_glob_pattern_if_present(dataset_path)
+    return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
 
 
 def resolve_datastore_settings(datastore_settings: DatastoreSettings | dict | None) -> DatastoreSettings:
@@ -114,25 +141,34 @@ def upload_to_hf_hub(
     return f"{repo_id}/{filename}"
 
 
-def _fetch_seed_dataset_column_names_from_datastore(
-    repo_id: str,
-    filename: str,
-    datastore_settings: Optional[Union[DatastoreSettings, dict]] = None,
-) -> list[str]:
-    file_type = filename.split(".")[-1]
-    if f".{file_type}" not in VALID_DATASET_FILE_EXTENSIONS:
-        raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
-
-    datastore_settings = resolve_datastore_settings(datastore_settings)
-    fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
-
-    with fs.open(f"datasets/{repo_id}/{filename}") as f:
-        return get_file_column_names(f, file_type)
-
+def _extract_single_file_path_from_glob_pattern_if_present(
+    file_path: str | Path,
+    fs: HfFileSystem | None = None,
+) -> Path:
+    file_path = Path(file_path)
 
-def _fetch_seed_dataset_column_names_from_local_file(dataset_path: str | Path) -> list[str]:
-    dataset_path = _validate_dataset_path(dataset_path, allow_glob_pattern=True)
-    return get_file_column_names(dataset_path, str(dataset_path).split(".")[-1])
+    # no glob pattern
+    if "*" not in str(file_path):
+        return file_path
+
+    # glob pattern with HfFileSystem
+    if fs is not None:
+        file_to_check = None
+        file_extension = file_path.name.split(".")[-1]
+        for file in fs.ls(str(file_path.parent)):
+            filename = file["name"]
+            if filename.endswith(f".{file_extension}"):
+                file_to_check = filename
+        if file_to_check is None:
+            raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
+        logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
+        return Path(file_to_check)
+
+    # glob pattern with local file system
+    if not (matching_files := sorted(file_path.parent.glob(file_path.name))):
+        raise InvalidFilePathError(f"🛑 No files found matching pattern: {str(file_path)!r}")
+    logger.debug(f"Using the first matching file in {str(file_path)!r} to determine column names in seed dataset")
+    return matching_files[0]
 
 
 def _validate_dataset_path(dataset_path: Union[str, Path], allow_glob_pattern: bool = False) -> Path:
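As a minimal usage sketch (not part of the commit), the two now-public helpers can be called roughly as follows; the repo id, file names, token, and the assumption that DatastoreSettings can be constructed from a token alone are illustrative placeholders:

from data_designer.config.datastore import (
    DatastoreSettings,
    fetch_seed_dataset_column_names_from_datastore,
    fetch_seed_dataset_column_names_from_local_file,
)

# Local file; a glob pattern is resolved to the first matching file before the columns are read.
local_columns = fetch_seed_dataset_column_names_from_local_file("seeds/*.parquet")

# Datastore-hosted file; the file is opened through HfFileSystem and the open file object is
# handed to get_file_column_names. Repo id, filename, and token below are placeholders, and
# DatastoreSettings is assumed to accept just a token (other fields defaulting).
datastore_columns = fetch_seed_dataset_column_names_from_datastore(
    repo_id="my-org/my-seed-dataset",
    filename="train.parquet",
    datastore_settings=DatastoreSettings(token="hf_xxx"),
)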

tests/config/test_datastore.py

Lines changed: 28 additions & 18 deletions
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from pathlib import Path
 from unittest.mock import MagicMock, patch
 
 import numpy as np
@@ -13,6 +12,7 @@
 from data_designer.config.datastore import (
     DatastoreSettings,
     fetch_seed_dataset_column_names,
+    fetch_seed_dataset_column_names_from_local_file,
     get_file_column_names,
     resolve_datastore_settings,
     upload_to_hf_hub,
@@ -127,22 +127,6 @@ def test_get_file_column_names_unicode(tmp_path, file_type):
     assert get_file_column_names(str(unicode_path), file_type) == df_unicode.columns.tolist()
 
 
-@pytest.mark.parametrize("file_type", ["parquet", "csv", "json", "jsonl"])
-def test_get_file_column_names_with_glob_pattern(tmp_path, file_type):
-    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
-    for i in range(5):
-        _write_file(df, tmp_path / f"{i}.{file_type}", file_type)
-    assert get_file_column_names(f"{tmp_path}/*.{file_type}", file_type) == ["col1", "col2"]
-
-
-def test_get_file_column_names_with_glob_pattern_error(tmp_path):
-    df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
-    for i in range(5):
-        _write_file(df, tmp_path / f"{i}.parquet", "parquet")
-    with pytest.raises(InvalidFilePathError, match="No files found matching pattern"):
-        get_file_column_names(f"{tmp_path}/*.csv", "csv")
-
-
 def test_get_file_column_names_with_filesystem_parquet():
     """Test get_file_column_names with filesystem parameter for parquet files."""
     mock_schema = MagicMock()
@@ -153,7 +137,7 @@ def test_get_file_column_names_with_filesystem_parquet():
         result = get_file_column_names("datasets/test/file.parquet", "parquet")
 
     assert result == ["col1", "col2", "col3"]
-    mock_read_schema.assert_called_once_with(Path("datasets/test/file.parquet"))
+    mock_read_schema.assert_called_once_with("datasets/test/file.parquet")
 
 
 @pytest.mark.parametrize("file_type", ["json", "jsonl", "csv"])
@@ -274,3 +258,29 @@ def test_upload_to_hf_hub_error_handling(datastore_settings):
     with patch("data_designer.config.datastore.Path.is_file", autospec=True) as mock_is_file:
         mock_is_file.return_value = True
         upload_to_hf_hub("test.text", "test.txt", "test/repo", datastore_settings)
+
+
+@pytest.mark.parametrize("file_type", ["parquet", "json", "jsonl", "csv"])
+def test_fetch_seed_dataset_column_names_from_local_file_with_glob(tmp_path, file_type):
+    """Test fetch_seed_dataset_column_names_from_local_file with glob pattern matching multiple files."""
+    test_data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
+
+    # Create multiple files with the same schema
+    for i in range(3):
+        file_path = tmp_path / f"data_{i}.{file_type}"
+        _write_file(test_data, file_path, file_type)
+
+    # Test glob pattern that matches all files
+    glob_pattern = str(tmp_path / f"*.{file_type}")
+    result = fetch_seed_dataset_column_names_from_local_file(glob_pattern)
+
+    assert result == ["col1", "col2", "col3"]
+
+
+@pytest.mark.parametrize("file_type", ["parquet", "csv"])
+def test_fetch_seed_dataset_column_names_from_local_file_with_glob_no_matches(tmp_path, file_type):
+    """Test fetch_seed_dataset_column_names_from_local_file with glob pattern that matches no files."""
+    glob_pattern = str(tmp_path / f"nonexistent_*.{file_type}")
+
+    with pytest.raises(InvalidFilePathError, match="does not contain files of type"):
+        fetch_seed_dataset_column_names_from_local_file(glob_pattern)
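The new tests above exercise the local-file path. A complementary sketch (hypothetical, not part of this commit) of a datastore-path test could patch HfFileSystem in the module under test and feed an in-memory parquet buffer to the reader; the test name and the reuse of the existing datastore_settings fixture are assumptions:

import io
from unittest.mock import MagicMock, patch

import pandas as pd

from data_designer.config.datastore import fetch_seed_dataset_column_names_from_datastore


def test_fetch_seed_dataset_column_names_from_datastore_mocked(datastore_settings):
    """Hypothetical sketch: resolve column names through a patched HfFileSystem."""
    # Write a tiny parquet table into memory so pq.read_schema has real bytes to parse.
    buffer = io.BytesIO()
    pd.DataFrame({"col1": [1], "col2": [2]}).to_parquet(buffer)
    buffer.seek(0)

    # fs.open(...) is used as a context manager by the code under test.
    mock_fs = MagicMock()
    mock_fs.open.return_value.__enter__.return_value = buffer

    with patch("data_designer.config.datastore.HfFileSystem", return_value=mock_fs):
        result = fetch_seed_dataset_column_names_from_datastore("test/repo", "data.parquet", datastore_settings)

    assert result == ["col1", "col2"]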
