Skip to content

Commit 7a70392

Browse files
[Dev] Move PyArrow filter pushdown to separate file (duckdb#51)
This has annoyed me for some time: having to remember that the filter pushdown logic is defined in `arrow_array_stream.cpp`, which is completely illogical. Cleaned it up, added a Makefile for `make format-main`, and extended the import cache to get rid of some raw `py::module_::import` calls.
2 parents b348aa6 + 94dbf24 commit 7a70392

File tree

15 files changed

+526
-372
lines changed

15 files changed

+526
-372
lines changed

scripts/cache_data.json

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,19 @@
77
"pyarrow.dataset",
88
"pyarrow.Table",
99
"pyarrow.RecordBatchReader",
10-
"pyarrow.ipc"
10+
"pyarrow.ipc",
11+
"pyarrow.scalar",
12+
"pyarrow.date32",
13+
"pyarrow.time64",
14+
"pyarrow.timestamp",
15+
"pyarrow.uint8",
16+
"pyarrow.uint16",
17+
"pyarrow.uint32",
18+
"pyarrow.uint64",
19+
"pyarrow.binary_view",
20+
"pyarrow.decimal32",
21+
"pyarrow.decimal64",
22+
"pyarrow.decimal128"
1123
]
1224
},
1325
"pyarrow.dataset": {
@@ -709,5 +721,77 @@
709721
"name": "duckdb_source",
710722
"children": [],
711723
"required": false
724+
},
725+
"pyarrow.scalar": {
726+
"type": "attribute",
727+
"full_path": "pyarrow.scalar",
728+
"name": "scalar",
729+
"children": []
730+
},
731+
"pyarrow.date32": {
732+
"type": "attribute",
733+
"full_path": "pyarrow.date32",
734+
"name": "date32",
735+
"children": []
736+
},
737+
"pyarrow.time64": {
738+
"type": "attribute",
739+
"full_path": "pyarrow.time64",
740+
"name": "time64",
741+
"children": []
742+
},
743+
"pyarrow.timestamp": {
744+
"type": "attribute",
745+
"full_path": "pyarrow.timestamp",
746+
"name": "timestamp",
747+
"children": []
748+
},
749+
"pyarrow.uint8": {
750+
"type": "attribute",
751+
"full_path": "pyarrow.uint8",
752+
"name": "uint8",
753+
"children": []
754+
},
755+
"pyarrow.uint16": {
756+
"type": "attribute",
757+
"full_path": "pyarrow.uint16",
758+
"name": "uint16",
759+
"children": []
760+
},
761+
"pyarrow.uint32": {
762+
"type": "attribute",
763+
"full_path": "pyarrow.uint32",
764+
"name": "uint32",
765+
"children": []
766+
},
767+
"pyarrow.uint64": {
768+
"type": "attribute",
769+
"full_path": "pyarrow.uint64",
770+
"name": "uint64",
771+
"children": []
772+
},
773+
"pyarrow.binary_view": {
774+
"type": "attribute",
775+
"full_path": "pyarrow.binary_view",
776+
"name": "binary_view",
777+
"children": []
778+
},
779+
"pyarrow.decimal32": {
780+
"type": "attribute",
781+
"full_path": "pyarrow.decimal32",
782+
"name": "decimal32",
783+
"children": []
784+
},
785+
"pyarrow.decimal64": {
786+
"type": "attribute",
787+
"full_path": "pyarrow.decimal64",
788+
"name": "decimal64",
789+
"children": []
790+
},
791+
"pyarrow.decimal128": {
792+
"type": "attribute",
793+
"full_path": "pyarrow.decimal128",
794+
"name": "decimal128",
795+
"children": []
712796
}
713797
}

scripts/generate_import_cache_cpp.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ def to_string(self):
182182

183183
for file in files:
184184
content = file.to_string()
185-
path = f'src/include/duckdb_python/import_cache/modules/{file.file_name}'
185+
path = f'src/duckdb_py/include/duckdb_python/import_cache/modules/{file.file_name}'
186186
import_cache_path = os.path.join(script_dir, '..', path)
187187
with open(import_cache_path, "w") as f:
188188
f.write(content)
@@ -237,7 +237,9 @@ def get_root_modules(files: List[ModuleFile]):
237237
238238
"""
239239

240-
import_cache_path = os.path.join(script_dir, '..', 'src/include/duckdb_python/import_cache/python_import_cache.hpp')
240+
import_cache_path = os.path.join(
241+
script_dir, '..', 'src/duckdb_py/include/duckdb_python/import_cache/python_import_cache.hpp'
242+
)
241243
with open(import_cache_path, "w") as f:
242244
f.write(import_cache_file)
243245

@@ -252,7 +254,7 @@ def get_module_file_path_includes(files: List[ModuleFile]):
252254
module_includes = get_module_file_path_includes(files)
253255

254256
modules_header = os.path.join(
255-
script_dir, '..', 'src/include/duckdb_python/import_cache/python_import_cache_modules.hpp'
257+
script_dir, '..', 'src/duckdb_py/include/duckdb_python/import_cache/python_import_cache_modules.hpp'
256258
)
257259
with open(modules_header, "w") as f:
258260
f.write(module_includes)

scripts/generate_import_cache_json.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ def update_json(existing: dict, new: dict) -> dict:
170170

171171
# If both values are dictionaries, update recursively.
172172
if isinstance(new_value, dict) and isinstance(old_value, dict):
173-
print(key)
174173
updated = update_json(old_value, new_value)
175174
existing[key] = updated
176175
else:

scripts/imports.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,22 @@
66
pyarrow.Table
77
pyarrow.RecordBatchReader
88
pyarrow.ipc.MessageReader
9+
pyarrow.scalar
10+
pyarrow.date32
11+
pyarrow.time64
12+
pyarrow.timestamp
13+
pyarrow.timestamp
14+
pyarrow.timestamp
15+
pyarrow.timestamp
16+
pyarrow.timestamp
17+
pyarrow.uint8
18+
pyarrow.uint16
19+
pyarrow.uint32
20+
pyarrow.uint64
21+
pyarrow.binary_view
22+
pyarrow.decimal32
23+
pyarrow.decimal64
24+
pyarrow.decimal128
925

1026
import pandas
1127

src/duckdb_py/arrow/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# this is used for clang-tidy checks
2-
add_library(python_arrow OBJECT arrow_array_stream.cpp arrow_export_utils.cpp)
2+
add_library(python_arrow OBJECT arrow_array_stream.cpp arrow_export_utils.cpp
3+
pyarrow_filter_pushdown.cpp)
34

45
target_link_libraries(python_arrow PRIVATE _duckdb_dependencies)

0 commit comments

Comments
 (0)