
Commit 8ae7b83 (1 parent: b82d1a5)

add file_size_bytes, wasn't able to run the tests locally.
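The commit adds a file_size_bytes option to DuckDBPyRelation.to_parquet: a rough per-file target size after which the writer rolls over to a new Parquet file. A minimal usage sketch, assuming a duckdb build that includes this commit (the relation and output paths are illustrative):

    import duckdb

    rel = duckdb.sql("SELECT range AS i, 'padding_text' AS payload FROM range(10000)")

    # Human-readable size string: start a new file after roughly 1 kB.
    # per_thread_output makes the target path a directory of files.
    rel.to_parquet("out_dir", file_size_bytes="1k", per_thread_output=True)

    # A plain integer byte count is accepted as well.
    rel.to_parquet("out_dir2", file_size_bytes=1000, per_thread_output=True)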

File tree: 5 files changed (+81 −3 lines)

_duckdb-stubs/__init__.pyi (2 additions, 0 deletions)
@@ -721,6 +721,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...

src/duckdb_py/include/duckdb_python/pyrelation.hpp (2 additions, 1 deletion)
@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
                 const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
                 const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
                 const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-                const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+                const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+                const py::object &file_size_bytes = py::none());
 
 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),

src/duckdb_py/pyrelation.cpp (12 additions, 1 deletion)
@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
                                  const py::object &overwrite, const py::object &per_thread_output,
                                  const py::object &use_tmp_file, const py::object &partition_by,
                                  const py::object &write_partition_columns, const py::object &append,
-                                 const py::object &filename_pattern) {
+                                 const py::object &filename_pattern, const py::object &file_size_bytes) {
 	case_insensitive_map_t<vector<Value>> options;
 
 	if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["filename_pattern"] = {Value(py::str(filename_pattern))};
 	}
 
+	if (!py::none().is(file_size_bytes)) {
+		if (py::isinstance<py::int_>(file_size_bytes)) {
+			int64_t file_size_bytes_int = py::int_(file_size_bytes);
+			options["file_size_bytes"] = {Value(file_size_bytes_int)};
+		} else if (py::isinstance<py::str>(file_size_bytes)) {
+			options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
+		} else {
+			throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
+		}
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
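The new branch forwards an integer as a numeric option value and a string (e.g. "1k") verbatim, rejecting anything else. A small sketch of the resulting Python-side behavior; treating duckdb.InvalidInputException as the exception class surfaced to Python is an assumption here:

    import duckdb

    rel = duckdb.sql("SELECT 1 AS i")

    rel.to_parquet("ok_int", file_size_bytes=1_000_000, per_thread_output=True)  # integer byte count
    rel.to_parquet("ok_str", file_size_bytes="1M", per_thread_output=True)       # human-readable string

    try:
        # A float matches neither branch, so the new check raises before writing.
        rel.to_parquet("bad", file_size_bytes=1.5)
    except duckdb.InvalidInputException as exc:
        print(exc)  # to_parquet only accepts 'file_size_bytes' as an integer or string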

src/duckdb_py/pyrelation/initialize.cpp (1 addition, 1 deletion)
@@ -37,7 +37,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
 	    py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
 	    py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
 	    py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
-	    py::arg("filename_pattern") = py::none());
+	    py::arg("filename_pattern") = py::none(), py::arg("file_size_bytes") = py::none());
 
 	DefineMethod(
 	    {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",

tests/fast/api/test_to_parquet.py (64 additions, 0 deletions)
@@ -225,3 +225,67 @@ def test_filename_pattern_with_uuid(self, pd):
         result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
+    def test_file_size_bytes_basic(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        # Create a larger dataset to ensure multiple files are created
+        df = pd.DataFrame(
+            {
+                "name": [f"name_{i}" for i in range(100)],
+                "value": [i * 100.0 for i in range(100)],
+                "description": [f"description_{i}_with_more_text" for i in range(100)],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, per_thread_output=True)
+
+        # Check that multiple files were created
+        files = list(pathlib.Path(temp_file_name).iterdir())
+        assert len(files) > 1, f"Expected multiple files, got {len(files)}"
+
+        # Verify data integrity
+        result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
+        assert len(result.execute().fetchall()) == 100
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    def test_file_size_bytes_with_partition(self, pd):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        # Create a dataset with enough data to trigger file splitting
+        df = pd.DataFrame(
+            {
+                "name": [f"name_{i}" for i in range(100)],
+                "value": [i * 100.0 for i in range(100)],
+                "category": ["a" if i < 50 else "b" for i in range(100)],
+                "description": [f"description_{i}_with_more_text_to_increase_size" for i in range(100)],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, partition_by=["category"], file_size_bytes="2k", per_thread_output=True)
+
+        # Check that files were created in partition directories
+        assert pathlib.Path(f"{temp_file_name}/category=a").exists()
+        assert pathlib.Path(f"{temp_file_name}/category=b").exists()
+
+        # Verify data integrity
+        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
+        assert len(result.execute().fetchall()) == 100
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", ["1M", "1G"])
+    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)
+
+        # With large file size limits, should create just one file
+        parquet_rel = duckdb.read_parquet(temp_file_name)
+        assert rel.execute().fetchall() == parquet_rel.execute().fetchall()
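Since the commit message notes the tests could not be run locally, here is one way to run just the new tests, sketched via pytest's Python API (assumes a dev build of this duckdb branch is importable and the repository root is the working directory):

    import pytest

    # Select only the new file_size_bytes tests from this file; exit code 0 means they all passed.
    raise SystemExit(pytest.main(["tests/fast/api/test_to_parquet.py", "-k", "file_size_bytes", "-v"]))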
