
Commit 8ae7b83 (1 parent: b82d1a5)

add file_size_bytes, wasn't able to run the tests locally.
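The commit adds a file_size_bytes option to DuckDBPyRelation.to_parquet: a rough per-file target size after which the writer rolls over to a new Parquet file. A minimal usage sketch, assuming a duckdb build that includes this commit (the relation and output paths are illustrative):

    import duckdb

    rel = duckdb.sql("SELECT range AS i, 'padding_text' AS payload FROM range(10000)")

    # Human-readable size string: start a new file after roughly 1 kB.
    # per_thread_output makes the target path a directory of files.
    rel.to_parquet("out_dir", file_size_bytes="1k", per_thread_output=True)

    # A plain integer byte count is accepted as well.
    rel.to_parquet("out_dir2", file_size_bytes=1000, per_thread_output=True)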

File tree: 5 files changed (+81 −3 lines)

_duckdb-stubs/__init__.pyi (2 additions, 0 deletions)
@@ -721,6 +721,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...

src/duckdb_py/include/duckdb_python/pyrelation.hpp (2 additions, 1 deletion)
@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
                 const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
                 const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
                 const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-                const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+                const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+                const py::object &file_size_bytes = py::none());
 
 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),

src/duckdb_py/pyrelation.cpp (12 additions, 1 deletion)
@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
                                  const py::object &overwrite, const py::object &per_thread_output,
                                  const py::object &use_tmp_file, const py::object &partition_by,
                                  const py::object &write_partition_columns, const py::object &append,
-                                 const py::object &filename_pattern) {
+                                 const py::object &filename_pattern, const py::object &file_size_bytes) {
 	case_insensitive_map_t<vector<Value>> options;
 
 	if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["filename_pattern"] = {Value(py::str(filename_pattern))};
 	}
 
+	if (!py::none().is(file_size_bytes)) {
+		if (py::isinstance<py::int_>(file_size_bytes)) {
+			int64_t file_size_bytes_int = py::int_(file_size_bytes);
+			options["file_size_bytes"] = {Value(file_size_bytes_int)};
+		} else if (py::isinstance<py::str>(file_size_bytes)) {
+			options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
+		} else {
+			throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
+		}
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
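The new branch forwards an integer as a numeric option value and a string (e.g. "1k") verbatim, rejecting anything else. A small sketch of the resulting Python-side behavior; treating duckdb.InvalidInputException as the exception class surfaced to Python is an assumption here:

    import duckdb

    rel = duckdb.sql("SELECT 1 AS i")

    rel.to_parquet("ok_int", file_size_bytes=1_000_000, per_thread_output=True)  # integer byte count
    rel.to_parquet("ok_str", file_size_bytes="1M", per_thread_output=True)       # human-readable string

    try:
        # A float matches neither branch, so the new check raises before writing.
        rel.to_parquet("bad", file_size_bytes=1.5)
    except duckdb.InvalidInputException as exc:
        print(exc)  # to_parquet only accepts 'file_size_bytes' as an integer or string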

src/duckdb_py/pyrelation/initialize.cpp (1 addition, 1 deletion)
@@ -37,7 +37,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
 	    py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
 	    py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
 	    py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
-	    py::arg("filename_pattern") = py::none());
+	    py::arg("filename_pattern") = py::none(), py::arg("file_size_bytes") = py::none());
 
 	DefineMethod(
 	    {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",

tests/fast/api/test_to_parquet.py (64 additions, 0 deletions)
@@ -225,3 +225,67 @@ def test_filename_pattern_with_uuid(self, pd):
         result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
+    def test_file_size_bytes_basic(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        # Create a larger dataset to ensure multiple files are created
+        df = pd.DataFrame(
+            {
+                "name": [f"name_{i}" for i in range(100)],
+                "value": [i * 100.0 for i in range(100)],
+                "description": [f"description_{i}_with_more_text" for i in range(100)],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, per_thread_output=True)
+
+        # Check that multiple files were created
+        files = list(pathlib.Path(temp_file_name).iterdir())
+        assert len(files) > 1, f"Expected multiple files, got {len(files)}"
+
+        # Verify data integrity
+        result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
+        assert len(result.execute().fetchall()) == 100
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    def test_file_size_bytes_with_partition(self, pd):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        # Create a dataset with enough data to trigger file splitting
+        df = pd.DataFrame(
+            {
+                "name": [f"name_{i}" for i in range(100)],
+                "value": [i * 100.0 for i in range(100)],
+                "category": ["a" if i < 50 else "b" for i in range(100)],
+                "description": [f"description_{i}_with_more_text_to_increase_size" for i in range(100)],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, partition_by=["category"], file_size_bytes="2k", per_thread_output=True)
+
+        # Check that files were created in partition directories
+        assert pathlib.Path(f"{temp_file_name}/category=a").exists()
+        assert pathlib.Path(f"{temp_file_name}/category=b").exists()
+
+        # Verify data integrity
+        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
+        assert len(result.execute().fetchall()) == 100
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", ["1M", "1G"])
+    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)
+
+        # With large file size limits, should create just one file
+        parquet_rel = duckdb.read_parquet(temp_file_name)
+        assert rel.execute().fetchall() == parquet_rel.execute().fetchall()
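Since the commit message notes the tests could not be run locally, here is one way to run just the new tests, sketched via pytest's Python API (assumes a dev build of this duckdb branch is importable and the repository root is the working directory):

    import pytest

    # Select only the new file_size_bytes tests from this file; exit code 0 means they all passed.
    raise SystemExit(pytest.main(["tests/fast/api/test_to_parquet.py", "-k", "file_size_bytes", "-v"]))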
