
Commit f21ac50

feat: add file_size_bytes to to_parquet

1 parent f2b5da9

6 files changed: +55 −7 lines changed

CONTRIBUTING.md

Lines changed: 4 additions & 4 deletions

@@ -1,5 +1,9 @@
 # Contributing to duckdb-python
 
+## Setting up a development environment
+
+See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
+
 ## General Guidelines
 
 ### **Did you find a bug?**
@@ -39,7 +43,3 @@
 ### Testing cross-platform and cross-Python
 
 * On your fork you can [run](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow#running-a-workflow) the Packaging workflow manually for any branch. You can choose whether to build for all platforms or a subset, and to either run the full testsuite, the fast tests only, or no tests at all.
-
-## Setting up a development environment
-
-See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).

_duckdb-stubs/__init__.pyi

Lines changed: 2 additions & 0 deletions

@@ -721,6 +721,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...

src/duckdb_py/include/duckdb_python/pyrelation.hpp

Lines changed: 2 additions & 1 deletion

@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
 	                const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
 	                const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
 	                const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-	                const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+	                const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+	                const py::object &file_size_bytes = py::none());
 
 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),

src/duckdb_py/pyrelation.cpp

Lines changed: 12 additions & 1 deletion

@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 	                                 const py::object &overwrite, const py::object &per_thread_output,
 	                                 const py::object &use_tmp_file, const py::object &partition_by,
 	                                 const py::object &write_partition_columns, const py::object &append,
-	                                 const py::object &filename_pattern) {
+	                                 const py::object &filename_pattern, const py::object &file_size_bytes) {
 	case_insensitive_map_t<vector<Value>> options;
 
 	if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["filename_pattern"] = {Value(py::str(filename_pattern))};
 	}
 
+	if (!py::none().is(file_size_bytes)) {
+		if (py::isinstance<py::int_>(file_size_bytes)) {
+			int64_t file_size_bytes_int = py::int_(file_size_bytes);
+			options["file_size_bytes"] = {Value(file_size_bytes_int)};
+		} else if (py::isinstance<py::str>(file_size_bytes)) {
+			options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
+		} else {
+			throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
+		}
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
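On the Python side this surfaces as a new `file_size_bytes` keyword on `to_parquet`, accepting either an integer byte count or a human-readable size string; anything else raises InvalidInputException, per the block above. A minimal usage sketch, assuming hypothetical output directories:

import duckdb

rel = duckdb.sql("SELECT i AS col_a, i AS col_b FROM range(0, 10000) tbl(i)")

# An integer is forwarded as a raw byte target per output file...
rel.to_parquet("out_dir_int", file_size_bytes=1_000)

# ...while a string may use human-readable sizes such as '1k' or '256MB'.
rel.to_parquet("out_dir_str", file_size_bytes="1k")

The size acts as a rollover target rather than an exact cap, which is why the tests below assert only that multiple files are produced, not their exact sizes.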

src/duckdb_py/pyrelation/initialize.cpp

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
 	    py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
 	    py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
 	    py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
-	    py::arg("filename_pattern") = py::none());
+	    py::arg("filename_pattern") = py::none(), py::arg("file_size_bytes") = py::none());
 
 	DefineMethod(
 	    {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",

tests/fast/api/test_to_parquet.py

Lines changed: 34 additions & 0 deletions

@@ -225,3 +225,37 @@ def test_filename_pattern_with_uuid(self, pd):
         result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
+    def test_file_size_bytes_basic(self, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+
+        # use same test data as external/duckdb/test/sql/copy/file_size_bytes.test
+        rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);")
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, row_group_size=2000)
+
+        # Check that multiple files were created
+        files = list(pathlib.Path(temp_file_name).iterdir())
+        assert len(files) > 1, f"Expected multiple files, got {len(files)}"
+
+        # Verify data integrity
+        result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
+        assert len(result.execute().fetchall()) == 10000
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"])
+    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)
+
+        # With large file size limits, should create just one file
+        parquet_rel = duckdb.read_parquet(temp_file_name)
+        assert rel.execute().fetchall() == parquet_rel.execute().fetchall()
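As the in-test comment notes, the data mirrors external/duckdb/test/sql/copy/file_size_bytes.test, since the new keyword forwards straight to the engine's existing FILE_SIZE_BYTES COPY option. For orientation, a sketch of the presumed SQL-level equivalent of the basic test (the output path 'out_dir' is hypothetical):

import duckdb

# Presumed COPY equivalent of rel.to_parquet(path, file_size_bytes="1k", row_group_size=2000)
duckdb.sql("""
    COPY (SELECT i AS col_a, i AS col_b FROM range(0, 10000) tbl(i))
    TO 'out_dir' (FORMAT PARQUET, FILE_SIZE_BYTES '1k', ROW_GROUP_SIZE 2000)
""")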
