Skip to content

Commit b51d550

Browse files
authored
Merge pull request #2868 from mabel-dev/clickbench-performance-regression-investigation-1
changes to parquet reader
2 parents: 7f1bf82 + 5fa0256 — commit b51d550

File tree

4 files changed

+16
-25
lines changed

4 files changed

+16
-25
lines changed

opteryx/__version__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
22
# DO NOT EDIT THIS FILE DIRECTLY
33

4-
__build__ = 1690
4+
__build__ = 1691
55
__author__ = "@joocer"
6-
__version__ = "0.26.0-beta.1690"
6+
__version__ = "0.26.0-beta.1691"
77

88
# Store the version here so:
99
# 1) we don't load dependencies by storing it in __init__.py

opteryx/managers/expression/ops.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -228,11 +228,7 @@ def _inner_filter_operations(arr, operator, value):
228228
matches = compute.match_like(arr, value).to_numpy(False).astype(dtype=numpy.bool_)
229229
return numpy.invert(matches)
230230
if operator == "ILike":
231-
return (
232-
compute.match_like(arr, value, ignore_case=True)
233-
.to_numpy(False)
234-
.astype(dtype=numpy.bool_)
235-
)
231+
return compute.match_like(arr, value, ignore_case=True).to_numpy(False).astype(dtype=numpy.bool_)
236232
if operator == "NotILike":
237233
matches = compute.match_like(arr, value, ignore_case=True)
238234
return numpy.invert(matches)

opteryx/utils/file_decoders.py

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -340,19 +340,15 @@ def parquet_decoder(
340340
# pyarrow.Buffer when we have a memoryview to avoid creating intermediate
341341
# Python bytes objects.
342342
if isinstance(buffer, memoryview):
343-
# pyarrow.py_buffer accepts buffer-protocol objects and is zero-copy
344-
try:
345-
pa_buf = pyarrow.py_buffer(buffer)
346-
stream = pyarrow.BufferReader(pa_buf)
347-
except Exception:
348-
# fallback to MemoryViewStream if pyarrow can't handle this memoryview
349-
stream = MemoryViewStream(buffer)
343+
pa_buf = pyarrow.py_buffer(buffer)
344+
stream = pyarrow.BufferReader(pa_buf)
350345
elif isinstance(buffer, bytes):
351346
stream = pyarrow.BufferReader(buffer)
352347
else:
353348
stream = pyarrow.input_stream(buffer)
354349

355-
parquet_file = parquet.ParquetFile(stream)
350+
pq_meta = parquet.read_metadata(stream)
351+
stream.seek(0)
356352

357353
# we need to work out if we have a selection which may force us
358354
# fetching columns just for filtering
@@ -366,7 +362,7 @@ def parquet_decoder(
366362
c.value for c in get_all_nodes_of_type(processed_selection, (NodeType.IDENTIFIER,))
367363
}
368364
selected_columns = list(
369-
projection_set.union(filter_columns).intersection(parquet_file.schema_arrow.names)
365+
projection_set.union(filter_columns).intersection(pq_meta.schema.names)
370366
)
371367

372368
# Read all columns if none are selected, unless force_read is set
@@ -376,18 +372,18 @@ def parquet_decoder(
376372
# get the full data size of the file to see how effective projection/selection is
377373
uncompressed_size = sum(
378374
row_group.column(j).total_uncompressed_size
379-
for i in range(parquet_file.metadata.num_row_groups)
380-
for row_group in [parquet_file.metadata.row_group(i)]
375+
for i in range(pq_meta.num_row_groups)
376+
for row_group in [pq_meta.row_group(i)]
381377
for j in range(row_group.num_columns)
382378
)
383379

384380
# If it's COUNT(*), we don't need to create a full dataset
385381
# We have a handler later to sum up the $COUNT(*) column
386382
if projection == [] and selection == []:
387-
table = pyarrow.Table.from_arrays([[parquet_file.metadata.num_rows]], names=["$COUNT(*)"])
383+
table = pyarrow.Table.from_arrays([[pq_meta.num_rows]], names=["$COUNT(*)"])
388384
return (
389-
parquet_file.metadata.num_rows,
390-
parquet_file.metadata.num_columns,
385+
pq_meta.num_rows,
386+
pq_meta.num_columns,
391387
uncompressed_size,
392388
table,
393389
)
@@ -400,16 +396,15 @@ def parquet_decoder(
400396
filters=dnf_filter,
401397
use_threads=use_threads,
402398
use_pandas_metadata=False,
403-
schema=parquet_file.schema_arrow,
404399
)
405400

406401
# Any filters we couldn't push to PyArrow to read we run here
407402
if processed_selection:
408403
table = filter_records(processed_selection, table)
409404

410405
return (
411-
parquet_file.metadata.num_rows,
412-
parquet_file.metadata.num_columns,
406+
pq_meta.num_rows,
407+
pq_meta.num_columns,
413408
uncompressed_size,
414409
table,
415410
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "opteryx"
3-
version = "0.26.0-beta.1690"
3+
version = "0.26.0-beta.1691"
44
description = "Query your data, where it lives"
55
requires-python = '>=3.11'
66
readme = {file = "README.md", content-type = "text/markdown"}

0 commit comments

Comments (0)