Commit 40063e1

fix tests

1 parent 8ae7b83 commit 40063e1

File tree

1 file changed: +7 -37 lines changed

tests/fast/api/test_to_parquet.py

Lines changed: 7 additions & 37 deletions
@@ -226,54 +226,24 @@ def test_filename_pattern_with_uuid(self, pd):
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
 
-    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
     @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
-    def test_file_size_bytes_basic(self, pd, file_size_bytes):
+    def test_file_size_bytes_basic(self, file_size_bytes):
         temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
-        # Create a larger dataset to ensure multiple files are created
-        df = pd.DataFrame(
-            {
-                "name": [f"name_{i}" for i in range(100)],
-                "value": [i * 100.0 for i in range(100)],
-                "description": [f"description_{i}_with_more_text" for i in range(100)],
-            }
-        )
-        rel = duckdb.from_df(df)
-        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, per_thread_output=True)
+
+        # use same test data as external/duckdb/test/sql/copy/file_size_bytes.test
+        rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);")
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, row_group_size=2000)
 
         # Check that multiple files were created
         files = list(pathlib.Path(temp_file_name).iterdir())
         assert len(files) > 1, f"Expected multiple files, got {len(files)}"
 
         # Verify data integrity
         result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
-        assert len(result.execute().fetchall()) == 100
-
-    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
-    def test_file_size_bytes_with_partition(self, pd):
-        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
-        # Create a dataset with enough data to trigger file splitting
-        df = pd.DataFrame(
-            {
-                "name": [f"name_{i}" for i in range(100)],
-                "value": [i * 100.0 for i in range(100)],
-                "category": ["a" if i < 50 else "b" for i in range(100)],
-                "description": [f"description_{i}_with_more_text_to_increase_size" for i in range(100)],
-            }
-        )
-        rel = duckdb.from_df(df)
-        rel.to_parquet(temp_file_name, partition_by=["category"], file_size_bytes="2k", per_thread_output=True)
-
-        # Check that files were created in partition directories
-        assert pathlib.Path(f"{temp_file_name}/category=a").exists()
-        assert pathlib.Path(f"{temp_file_name}/category=b").exists()
-
-        # Verify data integrity
-        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
-        assert len(result.execute().fetchall()) == 100
+        assert len(result.execute().fetchall()) == 10000
 
     @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
-    @pytest.mark.parametrize("file_size_bytes", ["1M", "1G"])
+    @pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"])
     def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
         temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
         df = pd.DataFrame(
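For context, a minimal standalone sketch of the behavior the rewritten test_file_size_bytes_basic exercises. It uses only calls that appear in this diff (duckdb.from_query, to_parquet with file_size_bytes and row_group_size, duckdb.read_parquet) and assumes a duckdb Python build that supports the file_size_bytes option under test; the scratch output path is illustrative:

import pathlib
import tempfile

import duckdb

# Illustrative scratch directory; to_parquet creates it and writes one or
# more Parquet files inside once the size cap kicks in.
out_dir = pathlib.Path(tempfile.mkdtemp()) / "parquet_out"

# Same data as the rewritten test: two integer columns, 10,000 rows.
rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0, 10000) tbl(i);")

# Cap each output file at roughly 1 KiB; the small row_group_size lets the
# writer rotate to a new file before all rows land in the first one.
rel.to_parquet(str(out_dir), file_size_bytes="1k", row_group_size=2000)

# The cap should have split the output across several files...
files = list(out_dir.iterdir())
assert len(files) > 1, f"Expected multiple files, got {len(files)}"

# ...and all 10,000 rows should still round-trip.
result = duckdb.read_parquet(f"{out_dir}/*.parquet")
assert len(result.execute().fetchall()) == 10000

Note that sourcing the data from a SQL range() query instead of a DataFrame is also what lets the rewritten test drop the NumpyPandas/ArrowPandas parametrization: pandas never enters the picture.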
