
Commit 7a0a181

Add filename_pattern to to_parquet Python API (#201)
Closes #178
2 parents: 8944767 + 5cb2afb

5 files changed (+69, −3 lines)

_duckdb-stubs/__init__.pyi

Lines changed: 2 additions & 0 deletions
@@ -720,6 +720,7 @@ class DuckDBPyRelation:
         partition_by: pytyping.List[str] | None = None,
         write_partition_columns: bool | None = None,
         append: bool | None = None,
+        filename_pattern: str | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -772,6 +773,7 @@ class DuckDBPyRelation:
         partition_by: pytyping.List[str] | None = None,
         write_partition_columns: bool | None = None,
         append: bool | None = None,
+        filename_pattern: str | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...
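
With the stub updated, type checkers see the new keyword on both to_parquet overloads. A minimal usage sketch, assuming this build of the duckdb package (the relation and output directory are illustrative, not from the commit):

import duckdb

# Any relation works; this one is just illustrative.
rel = duckdb.sql("SELECT 1 AS id, 'a' AS category UNION ALL SELECT 2, 'b'")

# '{i}' in the pattern is replaced with a per-file index, '{uuid}' with a
# random UUID, so partitioned writes get predictable file names, e.g.
# out_dir/category=a/orders_0.parquet.
rel.to_parquet("out_dir", partition_by=["category"], filename_pattern="orders_{i}")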

src/duckdb_py/include/duckdb_python/pyrelation.hpp

Lines changed: 1 addition & 1 deletion
@@ -214,7 +214,7 @@ struct DuckDBPyRelation {
 	                const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
 	                const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
 	                const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-	                const py::object &append = py::none());
+	                const py::object &append = py::none(), const py::object &filename_pattern = py::none());

 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),

src/duckdb_py/pyrelation.cpp

Lines changed: 9 additions & 1 deletion
@@ -1213,7 +1213,8 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
                                  const py::object &row_group_size_bytes, const py::object &row_group_size,
                                  const py::object &overwrite, const py::object &per_thread_output,
                                  const py::object &use_tmp_file, const py::object &partition_by,
-                                 const py::object &write_partition_columns, const py::object &append) {
+                                 const py::object &write_partition_columns, const py::object &append,
+                                 const py::object &filename_pattern) {
 	case_insensitive_map_t<vector<Value>> options;

 	if (!py::none().is(compression)) {
@@ -1304,6 +1305,13 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["use_tmp_file"] = {Value::BOOLEAN(py::bool_(use_tmp_file))};
 	}

+	if (!py::none().is(filename_pattern)) {
+		if (!py::isinstance<py::str>(filename_pattern)) {
+			throw InvalidInputException("to_parquet only accepts 'filename_pattern' as a string");
+		}
+		options["filename_pattern"] = {Value(py::str(filename_pattern))};
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
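
The new branch follows the same shape as the existing option handling: a non-None value must be a Python string, otherwise the write is rejected before any COPY is built. From the Python side that surfaces roughly like this (a sketch; the relation and path are illustrative):

import duckdb

rel = duckdb.sql("SELECT 1 AS id, 'a' AS category")

try:
    # An int is not accepted; only str passes the isinstance check above.
    rel.to_parquet("out_dir", partition_by=["category"], filename_pattern=42)
except duckdb.InvalidInputException as e:
    print(e)  # to_parquet only accepts 'filename_pattern' as a string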

src/duckdb_py/pyrelation/initialize.cpp

Lines changed: 2 additions & 1 deletion
@@ -36,7 +36,8 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
 	    py::arg("row_group_size_bytes") = py::none(), py::arg("row_group_size") = py::none(),
 	    py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
 	    py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
-	    py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none());
+	    py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
+	    py::arg("filename_pattern") = py::none());

 	DefineMethod(
 	    {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",

tests/fast/api/test_to_parquet.py

Lines changed: 55 additions & 0 deletions
@@ -1,4 +1,6 @@
 import os
+import pathlib
+import re
 import tempfile

 import pytest
@@ -170,3 +172,56 @@ def test_append(self, pd):
             ("shinji", 123.0, "a"),
         ]
         assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    def test_filename_pattern_with_index(self, pd):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, partition_by=["category"], filename_pattern="orders_{i}")
+        # Check that files follow the pattern with {i}
+        files_a = list(pathlib.Path(f"{temp_file_name}/category=a").iterdir())
+        files_b = list(pathlib.Path(f"{temp_file_name}/category=b").iterdir())
+        files_c = list(pathlib.Path(f"{temp_file_name}/category=c").iterdir())
+        filename_pattern = re.compile(r"^orders_[0-9]+\.parquet$")
+        assert all(filename_pattern.search(str(f.name)) for f in files_a)
+        assert all(filename_pattern.search(str(f.name)) for f in files_b)
+        assert all(filename_pattern.search(str(f.name)) for f in files_c)
+
+        # Verify data integrity
+        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
+        expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
+        assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    def test_filename_pattern_with_uuid(self, pd):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, partition_by=["category"], filename_pattern="file_{uuid}")
+        # Check that files follow the pattern with {uuid}
+        files_a = list(pathlib.Path(f"{temp_file_name}/category=a").iterdir())
+        files_b = list(pathlib.Path(f"{temp_file_name}/category=b").iterdir())
+        files_c = list(pathlib.Path(f"{temp_file_name}/category=c").iterdir())
+        filename_pattern = re.compile(r"^file_[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}\.parquet$")
+        print(files_a)
+        assert all(filename_pattern.search(str(f.name)) for f in files_a)
+        assert all(filename_pattern.search(str(f.name)) for f in files_b)
+        assert all(filename_pattern.search(str(f.name)) for f in files_c)
+
+        # Verify data integrity
+        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
+        expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
+        assert result.execute().fetchall() == expected
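
The tests exercise the keyword through the relation API; under the hood it lands in the same option map a SQL COPY uses, so an equivalent statement can be written directly (a sketch; the table and output path are illustrative):

import duckdb

duckdb.sql("CREATE TABLE t AS SELECT 1 AS id, 'a' AS category UNION ALL SELECT 2, 'b'")

# SQL-level equivalent of rel.to_parquet(..., filename_pattern='orders_{i}').
duckdb.sql("""
    COPY t TO 'out_sql'
    (FORMAT PARQUET, PARTITION_BY (category), FILENAME_PATTERN 'orders_{i}')
""")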
