 import os
+import pathlib
+import re
 import tempfile
 
 import pytest
@@ -170,3 +172,57 @@ def test_append(self, pd):
170 | 172 | ("shinji", 123.0, "a"), |
171 | 173 | ] |
172 | 174 | assert result.execute().fetchall() == expected |
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    def test_filename_pattern_with_index(self, pd):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
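+        # filename_pattern "{i}" expands to an incrementing per-file index
+        # (orders_0.parquet, orders_1.parquet, ...)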
+        rel.to_parquet(temp_file_name, partition_by=["category"], filename_pattern="orders_{i}")
+        # Check that files follow the pattern with {i}
+        files_a = list(pathlib.Path(f"{temp_file_name}/category=a").iterdir())
+        files_b = list(pathlib.Path(f"{temp_file_name}/category=b").iterdir())
+        files_c = list(pathlib.Path(f"{temp_file_name}/category=c").iterdir())
+        filename_pattern = re.compile(r"^orders_[0-9]+\.parquet$")
+        assert all(filename_pattern.search(f.name) for f in files_a)
+        assert all(filename_pattern.search(f.name) for f in files_b)
+        assert all(filename_pattern.search(f.name) for f in files_c)
+
+        # Verify data integrity
+        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
+        expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
+        assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    def test_filename_pattern_with_uuid(self, pd):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
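+        # filename_pattern "{uuid}" expands to a random UUID for each written file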
+        rel.to_parquet(temp_file_name, partition_by=["category"], filename_pattern="file_{uuid}")
+        # Check that files follow the pattern with {uuid}
+        files_a = list(pathlib.Path(f"{temp_file_name}/category=a").iterdir())
+        files_b = list(pathlib.Path(f"{temp_file_name}/category=b").iterdir())
+        files_c = list(pathlib.Path(f"{temp_file_name}/category=c").iterdir())
+        filename_pattern = re.compile(r"^file_[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12}\.parquet$")
+        assert all(filename_pattern.search(f.name) for f in files_a)
+        assert all(filename_pattern.search(f.name) for f in files_b)
+        assert all(filename_pattern.search(f.name) for f in files_c)
+
+        # Verify data integrity
+        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
+        expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
+        assert result.execute().fetchall() == expected