
Commit 34891cf

chore: Fix for `No magic bytes` and `416 Requested Range Not Satisfiable` errors (#57)

* Fix for `No magic bytes` and `416 Requested Range Not Satisfiable` errors
* move import
1 parent 21ec195 commit 34891cf
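Both error messages are consistent with stale cached file metadata rather than corrupt data. `HfFileSystem` caches directory listings and file sizes, and fsspec additionally caches filesystem instances by constructor arguments, so a re-uploaded file can end up being read through an instance that still remembers the old size. An HTTP Range request computed from that stale size can then point past the new end of the file (`416 Requested Range Not Satisfiable`), or land the Parquet footer read at the wrong offset, which Parquet readers typically surface as a missing-magic-bytes error.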

File tree

3 files changed: +65 −7 lines changed

src/data_designer/config/datastore.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -124,7 +124,7 @@ def _fetch_seed_dataset_column_names_from_datastore(
         raise InvalidFileFormatError(f"🛑 Unsupported file type: {filename!r}")
 
     datastore_settings = resolve_datastore_settings(datastore_settings)
-    fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token)
+    fs = HfFileSystem(endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True)
 
     with fs.open(f"datasets/{repo_id}/{filename}") as f:
         return get_file_column_names(f, file_type)
```
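The `skip_instance_cache=True` flag matters because fsspec caches filesystem *instances* keyed by their constructor arguments: a second `HfFileSystem(...)` call with the same arguments normally returns the very same object, together with whatever metadata it has already cached. A minimal sketch of that behavior (the endpoint value is illustrative):

```python
from huggingface_hub import HfFileSystem

# fsspec caches filesystem instances by constructor arguments, so
# identical calls return the same object, including its metadata caches.
fs_a = HfFileSystem(endpoint="https://huggingface.co")
fs_b = HfFileSystem(endpoint="https://huggingface.co")
assert fs_a is fs_b  # reused cached instance

# skip_instance_cache=True opts out of that cache, yielding a fresh
# instance whose file sizes and listings are fetched anew.
fs_fresh = HfFileSystem(endpoint="https://huggingface.co", skip_instance_cache=True)
assert fs_fresh is not fs_a
```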

src/data_designer/engine/resources/seed_dataset_data_store.py

Lines changed: 20 additions & 2 deletions

```diff
@@ -42,11 +42,29 @@ class HfHubSeedDatasetDataStore(SeedDatasetDataStore):
 
     def __init__(self, endpoint: str, token: str | None):
         self.hfapi = HfApi(endpoint=endpoint, token=token)
-        self.hffs = HfFileSystem(endpoint=endpoint, token=token)
+        self.endpoint = endpoint
+        self.token = token
 
     def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
+        """Create a DuckDB connection with a fresh HfFileSystem registered.
+
+        Creates a new HfFileSystem instance for each connection to ensure file metadata
+        is fetched fresh from the datastore, avoiding cache-related issues when reading
+        recently updated parquet files.
+
+        Returns:
+            A DuckDB connection with the HfFileSystem registered for hf:// URI support.
+        """
+        # Use skip_instance_cache to avoid fsspec-level caching
+        hffs = HfFileSystem(endpoint=self.endpoint, token=self.token, skip_instance_cache=True)
+
+        # Clear all internal caches to avoid stale metadata issues
+        # HfFileSystem caches file metadata (size, etc.) which can become stale when files are re-uploaded
+        if hasattr(hffs, "dircache"):
+            hffs.dircache.clear()
+
         conn = duckdb.connect()
-        conn.register_filesystem(self.hffs)
+        conn.register_filesystem(hffs)
         return conn
 
     def get_dataset_uri(self, file_id: str) -> str:
```
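For context, DuckDB's Python API can register fsspec filesystems directly, which is what `create_duckdb_connection` relies on to resolve `hf://` URIs. A usage sketch of the same pattern (the dataset path is illustrative, and a real query requires access to the referenced repo):

```python
import duckdb
from huggingface_hub import HfFileSystem

# Mirror create_duckdb_connection(): build a fresh filesystem and
# register it so DuckDB can resolve hf:// URIs through fsspec.
hffs = HfFileSystem(skip_instance_cache=True)
conn = duckdb.connect()
conn.register_filesystem(hffs)

# Illustrative query against a datastore-hosted parquet file.
rows = conn.execute(
    "SELECT * FROM 'hf://datasets/test/repo/test.parquet' LIMIT 5"
).fetchall()
```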

tests/config/test_datastore.py

Lines changed: 44 additions & 4 deletions

```diff
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+from pathlib import Path
 from unittest.mock import MagicMock, patch
 
 import numpy as np
@@ -142,6 +143,36 @@ def test_get_file_column_names_with_glob_pattern_error(tmp_path):
         get_file_column_names(f"{tmp_path}/*.csv", "csv")
 
 
+def test_get_file_column_names_with_filesystem_parquet():
+    """Test get_file_column_names with filesystem parameter for parquet files."""
+    mock_schema = MagicMock()
+    mock_schema.names = ["col1", "col2", "col3"]
+
+    with patch("data_designer.config.datastore.pq.read_schema") as mock_read_schema:
+        mock_read_schema.return_value = mock_schema
+        result = get_file_column_names("datasets/test/file.parquet", "parquet")
+
+    assert result == ["col1", "col2", "col3"]
+    mock_read_schema.assert_called_once_with(Path("datasets/test/file.parquet"))
+
+
+@pytest.mark.parametrize("file_type", ["json", "jsonl", "csv"])
+def test_get_file_column_names_with_filesystem_non_parquet(tmp_path, file_type):
+    """Test get_file_column_names with file-like objects for non-parquet files."""
+    test_data = pd.DataFrame({"col1": [1], "col2": [2], "col3": [3]})
+
+    # Create a real temporary file
+    file_path = tmp_path / f"test_file.{file_type}"
+    if file_type in ["json", "jsonl"]:
+        test_data.to_json(file_path, orient="records", lines=True)
+    else:
+        test_data.to_csv(file_path, index=False)
+
+    result = get_file_column_names(str(file_path), file_type)
+
+    assert result == ["col1", "col2", "col3"]
+
+
 def test_get_file_column_names_error_handling():
     with pytest.raises(InvalidFilePathError, match="🛑 Unsupported file type: 'txt'"):
         get_file_column_names("test.txt", "txt")
@@ -177,20 +208,29 @@ def test_fetch_seed_dataset_column_names_local_file(mock_get_file_column_names,
     assert fetch_seed_dataset_column_names(LocalSeedDatasetReference(dataset="test.parquet")) == ["col1", "col2"]
 
 
-@patch("data_designer.config.datastore.HfFileSystem.open")
+@patch("data_designer.config.datastore.HfFileSystem")
 @patch("data_designer.config.datastore.get_file_column_names", autospec=True)
-def test_fetch_seed_dataset_column_names_remote_file(mock_get_file_column_names, mock_hf_fs_open, datastore_settings):
+def test_fetch_seed_dataset_column_names_remote_file(mock_get_file_column_names, mock_hf_fs, datastore_settings):
     mock_get_file_column_names.return_value = ["col1", "col2"]
+    mock_fs_instance = MagicMock()
+    mock_hf_fs.return_value = mock_fs_instance
+
     assert fetch_seed_dataset_column_names(
         DatastoreSeedDatasetReference(
             dataset="test/repo/test.parquet",
             datastore_settings=datastore_settings,
         )
     ) == ["col1", "col2"]
-    mock_hf_fs_open.assert_called_once_with(
-        "datasets/test/repo/test.parquet",
+
+    mock_hf_fs.assert_called_once_with(
+        endpoint=datastore_settings.endpoint, token=datastore_settings.token, skip_instance_cache=True
     )
 
+    # The get_file_column_names is called with a file-like object from fs.open()
+    assert mock_get_file_column_names.call_count == 1
+    call_args = mock_get_file_column_names.call_args
+    assert call_args[0][1] == "parquet"
+
 
 def test_resolve_datastore_settings(datastore_settings):
     with pytest.raises(InvalidConfigError, match="Datastore settings are required"):
```
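Note that because the fix now constructs a new `HfFileSystem` per call, the remote-file test patches the class itself rather than `HfFileSystem.open`, and asserts that the constructor receives `skip_instance_cache=True`. The suite can be run with `pytest tests/config/test_datastore.py`.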
