
Commit f21ac50

feat: add file_size_bytes to to_parquet

1 parent f2b5da9

6 files changed: +55 −7 lines changed

CONTRIBUTING.md

Lines changed: 4 additions & 4 deletions

@@ -1,5 +1,9 @@
 # Contributing to duckdb-python
 
+## Setting up a development environment
+
+See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
+
 ## General Guidelines
 
 ### **Did you find a bug?**
@@ -39,7 +43,3 @@
 ### Testing cross-platform and cross-Python
 
 * On your fork you can [run](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow#running-a-workflow) the Packaging workflow manually for any branch. You can choose whether to build for all platforms or a subset, and to either run the full testsuite, the fast tests only, or no tests at all.
-
-## Setting up a development environment
-
-See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).

_duckdb-stubs/__init__.pyi

Lines changed: 2 additions & 0 deletions

@@ -721,6 +721,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...

src/duckdb_py/include/duckdb_python/pyrelation.hpp

Lines changed: 2 additions & 1 deletion

@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
 	                const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
 	                const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
 	                const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-	                const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+	                const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+	                const py::object &file_size_bytes = py::none());
 
 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),

src/duckdb_py/pyrelation.cpp

Lines changed: 12 additions & 1 deletion

@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 	                                 const py::object &overwrite, const py::object &per_thread_output,
 	                                 const py::object &use_tmp_file, const py::object &partition_by,
 	                                 const py::object &write_partition_columns, const py::object &append,
-	                                 const py::object &filename_pattern) {
+	                                 const py::object &filename_pattern, const py::object &file_size_bytes) {
 	case_insensitive_map_t<vector<Value>> options;
 
 	if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["filename_pattern"] = {Value(py::str(filename_pattern))};
 	}
 
+	if (!py::none().is(file_size_bytes)) {
+		if (py::isinstance<py::int_>(file_size_bytes)) {
+			int64_t file_size_bytes_int = py::int_(file_size_bytes);
+			options["file_size_bytes"] = {Value(file_size_bytes_int)};
+		} else if (py::isinstance<py::str>(file_size_bytes)) {
+			options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
+		} else {
+			throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
+		}
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
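On the Python side this surfaces as a new `file_size_bytes` keyword on `to_parquet`, accepting either an integer byte count or a human-readable size string; anything else raises InvalidInputException, per the block above. A minimal usage sketch, assuming hypothetical output directories:

import duckdb

rel = duckdb.sql("SELECT i AS col_a, i AS col_b FROM range(0, 10000) tbl(i)")

# An integer is forwarded as a raw byte target per output file...
rel.to_parquet("out_dir_int", file_size_bytes=1_000)

# ...while a string may use human-readable sizes such as '1k' or '256MB'.
rel.to_parquet("out_dir_str", file_size_bytes="1k")

The size acts as a rollover target rather than an exact cap, which is why the tests below assert only that multiple files are produced, not their exact sizes.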

src/duckdb_py/pyrelation/initialize.cpp

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
 	    py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
 	    py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
 	    py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
-	    py::arg("filename_pattern") = py::none());
+	    py::arg("filename_pattern") = py::none(), py::arg("file_size_bytes") = py::none());
 
 	DefineMethod(
 	    {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",

tests/fast/api/test_to_parquet.py

Lines changed: 34 additions & 0 deletions

@@ -225,3 +225,37 @@ def test_filename_pattern_with_uuid(self, pd):
         result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
+    def test_file_size_bytes_basic(self, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+
+        # use same test data as external/duckdb/test/sql/copy/file_size_bytes.test
+        rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);")
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, row_group_size=2000)
+
+        # Check that multiple files were created
+        files = list(pathlib.Path(temp_file_name).iterdir())
+        assert len(files) > 1, f"Expected multiple files, got {len(files)}"
+
+        # Verify data integrity
+        result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
+        assert len(result.execute().fetchall()) == 10000
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"])
+    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)
+
+        # With large file size limits, should create just one file
+        parquet_rel = duckdb.read_parquet(temp_file_name)
+        assert rel.execute().fetchall() == parquet_rel.execute().fetchall()
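As the in-test comment notes, the data mirrors external/duckdb/test/sql/copy/file_size_bytes.test, since the new keyword forwards straight to the engine's existing FILE_SIZE_BYTES COPY option. For orientation, a sketch of the presumed SQL-level equivalent of the basic test (the output path 'out_dir' is hypothetical):

import duckdb

# Presumed COPY equivalent of rel.to_parquet(path, file_size_bytes="1k", row_group_size=2000)
duckdb.sql("""
    COPY (SELECT i AS col_a, i AS col_b FROM range(0, 10000) tbl(i))
    TO 'out_dir' (FORMAT PARQUET, FILE_SIZE_BYTES '1k', ROW_GROUP_SIZE 2000)
""")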
