Skip to content

Commit 1710744

Browse files
authored
chore: rudimentary Python benchmarks & support more Polars types (#4442)
I had to patch a hole in Polars type conversion as well. I do not understand why duckdb is so slow. Maybe Arrow conversion is very expensive? Ideally there should be no gap between `test_scan_XXX` and each engine's XXX benchmark because all of these queries are simple O(N) scans. ``` ------------------------------------------------------------------------------------------------ benchmark: 15 tests ------------------------------------------------------------------------------------------------- Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- test_repeated_scan_scalar_at 1.0829 (1.0) 12.7910 (1.0) 1.9084 (1.0) 0.5208 (1.0) 1.7919 (1.0) 0.6661 (1.0) 8534;503 523,994.6207 (1.0) 36811 1 test_scan_scalar_at 32.4170 (29.94) 212.0000 (16.57) 40.4745 (21.21) 4.1199 (7.91) 40.1658 (22.42) 4.1660 (6.25) 1217;209 24,706.9329 (0.05) 7470 1 test_repeated_scan 122.7499 (113.35) 237.9999 (18.61) 140.3071 (73.52) 5.5270 (10.61) 141.7079 (79.08) 2.9160 (4.38) 445;426 7,127.2225 (0.01) 2293 1 test_scan 148.0419 (136.71) 555.2911 (43.41) 172.1939 (90.23) 30.2790 (58.14) 166.6665 (93.01) 7.2501 (10.88) 22;81 5,807.4083 (0.01) 810 1 test_polars_scalar_at 161.6669 (149.29) 451.8752 (35.33) 216.6078 (113.50) 23.6843 (45.48) 214.8750 (119.92) 24.1671 (36.28) 811;107 4,616.6396 (0.01) 3865 1 test_duckdb_scalar_at 183.4580 (169.41) 1,157.4170 (90.49) 266.8048 (139.80) 56.7218 (108.92) 258.9581 (144.52) 64.0000 (96.08) 556;120 3,748.0582 (0.01) 2596 1 test_polars 188.6250 (174.19) 497.0001 (38.86) 252.7230 (132.43) 43.1075 (82.78) 242.3751 (135.26) 33.5106 (50.31) 193;100 3,956.9012 (0.01) 1277 1 test_polars_streaming_scalar_at 213.6659 (197.31) 822.5001 (64.30) 282.8750 (148.22) 38.8552 (74.61) 275.7079 (153.87) 25.8966 (38.88) 366;230 3,535.1300 
(0.01) 3096 1 test_scan_filter 360.1250 (332.56) 527.9169 (41.27) 408.9275 (214.28) 26.6184 (51.11) 403.7919 (225.35) 28.4789 (42.75) 263;68 2,445.4213 (0.00) 1109 1 test_repeated_scan_filter 378.2089 (349.26) 475.5841 (37.18) 410.9510 (215.34) 16.5570 (31.79) 412.9999 (230.49) 10.5419 (15.83) 484;450 2,433.3800 (0.00) 1709 1 test_polars_filter 397.4999 (367.07) 957.6660 (74.87) 479.0831 (251.04) 43.1060 (82.77) 468.7500 (261.60) 40.5819 (60.92) 182;40 2,087.3206 (0.00) 1006 1 test_polars_streaming 435.4171 (402.09) 1,088.7499 (85.12) 556.2495 (291.47) 51.5738 (99.03) 548.3751 (306.04) 38.2287 (57.39) 79;43 1,797.7544 (0.00) 571 1 test_polars_streaming_filter 621.7081 (574.12) 1,215.5001 (95.03) 712.6453 (373.42) 55.5044 (106.58) 699.2079 (390.21) 68.1449 (102.30) 217;14 1,403.2226 (0.00) 1001 1 test_duckdb_filter 909.5829 (839.95) 2,197.5001 (171.80) 1,169.5916 (612.86) 148.8653 (285.86) 1,147.1671 (640.21) 220.3751 (330.83) 181;2 854.9993 (0.00) 563 1 test_duckdb 920.4999 (850.04) 1,703.5829 (133.19) 1,262.7979 (661.70) 182.6230 (350.68) 1,228.4580 (685.58) 335.4165 (503.53) 266;0 791.8923 (0.00) 644 1 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ``` Signed-off-by: Daniel King <[email protected]>
1 parent dbec716 commit 1710744

File tree

11 files changed

+234
-1
lines changed

11 files changed

+234
-1
lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,11 @@ jobs:
8080
uv run --all-packages pytest --benchmark-disable test/
8181
working-directory: vortex-python/
8282

83+
- name: Pytest Benchmarks - Vortex
84+
run: |
85+
uv run --all-packages pytest --benchmark-only benchmark/
86+
working-directory: vortex-python/
87+
8388
- name: Doctest - PyVortex
8489
run: |
8590
uv run --all-packages make doctest

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ dev-dependencies = [
3333
"pytest>=7.4.0",
3434
"ruff>=0.7.1",
3535
"ray>=2.48",
36+
"pytest-benchmark>=5.1.0",
3637
]
3738

3839
[tool.uv.workspace]

uv.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
import hashlib
5+
import math
6+
import os
7+
8+
import pyarrow as pa
9+
import pytest
10+
11+
import vortex as vx
12+
13+
14+
@pytest.fixture(
    scope="session",
    params=[{"x"}, {"x", "y"}, {"x", "z"}, {"x", "y", "z"}],
    ids=["int", "int_str", "int_float", "int_str_float"],
)
def vxf(tmp_path_factory: pytest.TempPathFactory, request: pytest.FixtureRequest) -> vx.VortexFile:
    """Write a 100,000-row Vortex file for one column combination and open it.

    The fixture is session-scoped and parametrized over sets of column names:

    * ``x`` (always present): the integers ``0..length-1``.
    * ``y`` (optional): the MD5 hex digest of each ``x`` encoded as 4 big-endian bytes.
    * ``z`` (optional): ``math.sqrt`` of each ``x``.

    Returns:
        The file written for this parameter, re-opened with ``vx.open``.
    """
    length = 100_000

    columns: dict[str, list[int] | list[float] | list[str]] = {}
    assert "x" in request.param  # pyright: ignore[reportAny]
    columns["x"] = list(range(length))

    if "y" in request.param:  # pyright: ignore[reportAny]
        # byteorder is spelled out: it only gained a default ("big") in Python 3.11.
        columns["y"] = [
            hashlib.md5(x.to_bytes(length=4, byteorder="big"), usedforsecurity=False).hexdigest()
            for x in range(length)
        ]
    if "z" in request.param:  # pyright: ignore[reportAny]
        columns["z"] = [math.sqrt(x) for x in range(length)]

    # `tmp_path_factory` is the fixture that actually provides a TempPathFactory.
    # mktemp() hands back a fresh, uniquely numbered directory on every call, so
    # the target file can never pre-exist and no existence check is needed.
    fname = tmp_path_factory.mktemp("data") / "foo.vortex"

    a = vx.array(pa.table(columns))  # pyright: ignore[reportCallIssue, reportUnknownArgumentType, reportArgumentType]
    vx.io.write(a, str(fname))
    return vx.open(str(fname))
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
from typing import Literal
5+
6+
import duckdb
7+
import pyarrow as pa
8+
import pytest
9+
from pyarrow.types import is_floating, is_integer
10+
from pytest_benchmark.fixture import BenchmarkFixture # pyright: ignore[reportMissingTypeStubs]
11+
12+
import vortex as vx
13+
14+
15+
def _has_mean(t: pa.DataType) -> bool:
    """Return True when `t` is an Arrow integer or floating-point type."""
    if is_integer(t):
        return True
    return is_floating(t)
17+
18+
19+
@pytest.mark.benchmark(group="aggregation", disable_gc=True)
def test_arrow_table_aggregation(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark scanning into a single Arrow table and averaging its numeric columns."""
    # One ("column", "mean") pair per field accepted by `_has_mean`.
    aggregations: list[tuple[str, Literal["mean"]]] = []
    for field in vxf.dtype.to_arrow_schema():  # pyright: ignore[reportUnknownVariableType]
        if _has_mean(field.type):  # pyright: ignore[reportUnknownMemberType, reportUnknownArgumentType]
            aggregations.append((field.name, "mean"))  # pyright: ignore[reportUnknownMemberType, reportUnknownArgumentType]

    def scan_and_aggregate():
        # An empty key list makes the group-by aggregate over the whole table.
        table = pa.concat_tables(batch.to_arrow_table() for batch in vxf.scan())
        return table.group_by([]).aggregate(aggregations)

    benchmark(scan_and_aggregate)
27+
28+
29+
@pytest.mark.benchmark(group="aggregation", disable_gc=True)
def test_polars_aggregation(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark `mean()` over the Polars frame from `vxf.to_polars()`, collected to Arrow."""
    frame = vxf.to_polars()

    def mean_to_arrow():
        return frame.mean().collect().to_arrow()

    benchmark(mean_to_arrow)
33+
34+
35+
@pytest.mark.benchmark(group="aggregation", disable_gc=True)
def test_polars_streaming_aggregation(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark `mean()` over the Polars frame, collected with `engine="streaming"`."""
    frame = vxf.to_polars()

    def mean_to_arrow():
        return frame.mean().collect(engine="streaming").to_arrow()

    benchmark(mean_to_arrow)
39+
40+
41+
@pytest.mark.benchmark(group="aggregation", disable_gc=True)
def test_duckdb_aggregation(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark a DuckDB `avg(...)` over every numeric column of the Vortex dataset.

    The file is exposed to an in-memory DuckDB connection as the relation `ds`;
    only the query execution and its conversion to an Arrow table are timed.
    """
    conn = duckdb.connect(database=":memory:")  # pyright: ignore[reportUnknownMemberType]
    ds = vxf.to_dataset()
    _ = conn.register("ds", ds)
    # One `avg(ds.<col>) as <col>` term per field accepted by `_has_mean`.
    aggregations = ",".join(
        [f"avg(ds.{field.name}) as {field.name}" for field in vxf.dtype.to_arrow_schema() if _has_mean(field.type)]  # pyright: ignore[reportUnknownVariableType, reportUnknownMemberType, reportUnknownArgumentType]
    )
    # The query is built once, outside the timed callable. (A leftover debug
    # print of the projection list was removed here.)
    query = f"select {aggregations} from ds"
    benchmark(lambda: conn.sql(query).to_arrow_table())
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
import duckdb
5+
import polars as pl
6+
import pyarrow as pa
7+
import pytest
8+
from pytest_benchmark.fixture import BenchmarkFixture # pyright: ignore[reportMissingTypeStubs]
9+
10+
import vortex as vx
11+
from vortex.expr import column
12+
13+
14+
@pytest.mark.benchmark(group="filter", disable_gc=True)
def test_scan_filter(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark a scan with the `x >= 50_000` filter expression, concatenated into one Arrow table."""

    def filtered_scan():
        # The expression and the scan are both created inside the timed call.
        batches = vxf.scan(expr=column("x") >= 50_000)
        return pa.concat_tables(batch.to_arrow_table() for batch in batches)

    benchmark(filtered_scan)
17+
18+
19+
@pytest.mark.benchmark(group="filter", disable_gc=True)
def test_repeated_scan_filter(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark re-executing a prepared, filtered repeated scan.

    The repeated scan is built once, outside the timed callable; each round only
    executes it and concatenates the results into one Arrow table.
    """
    # Use `>=` so the predicate matches the other benchmarks in the "filter"
    # group (scan, Polars and DuckDB all select x >= 50_000).
    rscan = vxf.to_repeated_scan(expr=column("x") >= 50_000)
    benchmark(lambda: pa.concat_tables(x.to_arrow_table() for x in rscan.execute()))
23+
24+
25+
@pytest.mark.benchmark(group="filter", disable_gc=True)
def test_polars_filter(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark filtering the Polars frame on `x >= 50_000` and collecting to Arrow."""
    frame = vxf.to_polars()

    def filter_to_arrow():
        # The literal is cast to Int64 before the comparison.
        threshold = pl.lit(50_000).cast(pl.Int64)
        return frame.filter(pl.col("x") >= threshold).collect().to_arrow()

    benchmark(filter_to_arrow)
29+
30+
31+
@pytest.mark.benchmark(group="filter", disable_gc=True)
def test_polars_streaming_filter(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark filtering the Polars frame on `x >= 50_000`, collected with `engine="streaming"`."""
    frame = vxf.to_polars()

    def filter_to_arrow():
        # The literal is cast to Int64 before the comparison.
        threshold = pl.lit(50_000).cast(pl.Int64)
        return frame.filter(pl.col("x") >= threshold).collect(engine="streaming").to_arrow()

    benchmark(filter_to_arrow)
35+
36+
37+
@pytest.mark.benchmark(group="filter", disable_gc=True)
def test_duckdb_filter(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark a DuckDB query selecting `x` where `x >= 50000` from the Vortex dataset."""
    dataset = vxf.to_dataset()
    connection = duckdb.connect(database=":memory:")  # pyright: ignore[reportUnknownMemberType]
    # The dataset is registered under the name the SQL text refers to.
    _ = connection.register("ds", dataset)
    query = "select ds.x from ds where x >= 50000"

    def run_query():
        return connection.sql(query).to_arrow_table()

    benchmark(run_query)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
import duckdb
5+
import pyarrow as pa
6+
import pytest
7+
from pytest_benchmark.fixture import BenchmarkFixture # pyright: ignore[reportMissingTypeStubs]
8+
9+
import vortex as vx
10+
11+
12+
@pytest.mark.benchmark(group="scalar_at", disable_gc=True)
def test_scan_scalar_at(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark a scan restricted to the single index 50_000, concatenated into one Arrow table."""

    def scan_one_index():
        # The index array is rebuilt on every call, so its construction is timed too.
        batches = vxf.scan(indices=vx.array([50_000]))
        return pa.concat_tables(batch.to_arrow_table() for batch in batches)

    benchmark(scan_one_index)
15+
16+
17+
@pytest.mark.benchmark(group="scalar_at", disable_gc=True)
def test_repeated_scan_scalar_at(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark `scalar_at(50_000)` on a repeated scan that is built once, outside the timed call."""
    repeated = vxf.to_repeated_scan()

    def lookup():
        return repeated.scalar_at(50_000)

    benchmark(lookup)
21+
22+
23+
@pytest.mark.benchmark(group="scalar_at", disable_gc=True)
def test_polars_scalar_at(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark fetching the single row at offset 50_000 through Polars.

    The frame from `vxf.to_polars()` is created once; each round slices it,
    collects the result and converts it to Arrow.
    """
    lf = vxf.to_polars()
    # Polars' slice takes (offset, length), not (start, end): a length of 1
    # selects exactly the row at offset 50_000.
    benchmark(lambda: lf.slice(50_000, 1).collect().to_arrow())
27+
28+
29+
@pytest.mark.benchmark(group="scalar_at", disable_gc=True)
def test_polars_streaming_scalar_at(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark fetching the single row at offset 50_000 through Polars' streaming engine.

    The frame from `vxf.to_polars()` is created once; each round slices it,
    collects with `engine="streaming"` and converts the result to Arrow.
    """
    lf = vxf.to_polars()
    # Polars' slice takes (offset, length), not (start, end): a length of 1
    # selects exactly the row at offset 50_000.
    benchmark(lambda: lf.slice(50_000, 1).collect(engine="streaming").to_arrow())
33+
34+
35+
@pytest.mark.benchmark(group="scalar_at", disable_gc=True)
def test_duckdb_scalar_at(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark a DuckDB query returning one `x` value via `offset 50000 limit 1`."""
    dataset = vxf.to_dataset()
    connection = duckdb.connect(database=":memory:")  # pyright: ignore[reportUnknownMemberType]
    # The dataset is registered under the name the SQL text refers to.
    _ = connection.register("ds", dataset)
    query = "select ds.x from ds offset 50000 limit 1"

    def run_query():
        return connection.sql(query).to_arrow_table()

    benchmark(run_query)
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
import duckdb
5+
import pyarrow as pa
6+
import pytest
7+
from pytest_benchmark.fixture import BenchmarkFixture # pyright: ignore[reportMissingTypeStubs]
8+
9+
import vortex as vx
10+
11+
12+
@pytest.mark.benchmark(group="scan", disable_gc=True)
def test_scan(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark an unfiltered scan of the file, concatenated into one Arrow table."""

    def full_scan():
        # A new scan is started on every call.
        return pa.concat_tables(batch.to_arrow_table() for batch in vxf.scan())

    benchmark(full_scan)
15+
16+
17+
@pytest.mark.benchmark(group="scan", disable_gc=True)
def test_repeated_scan(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark executing a repeated scan that is built once, outside the timed call."""
    repeated = vxf.to_repeated_scan()

    def execute_scan():
        return pa.concat_tables(batch.to_arrow_table() for batch in repeated.execute())

    benchmark(execute_scan)
21+
22+
23+
@pytest.mark.benchmark(group="scan", disable_gc=True)
def test_polars(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark collecting the whole Polars frame from `vxf.to_polars()` and converting it to Arrow."""
    frame = vxf.to_polars()

    def collect_to_arrow():
        return frame.collect().to_arrow()

    benchmark(collect_to_arrow)
27+
28+
29+
@pytest.mark.benchmark(group="scan", disable_gc=True)
def test_polars_streaming(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark collecting the whole Polars frame with `engine="streaming"` and converting it to Arrow."""
    frame = vxf.to_polars()

    def collect_to_arrow():
        return frame.collect(engine="streaming").to_arrow()

    benchmark(collect_to_arrow)
33+
34+
35+
@pytest.mark.benchmark(group="scan", disable_gc=True)
def test_duckdb(benchmark: BenchmarkFixture, vxf: vx.VortexFile):
    """Benchmark a DuckDB query selecting the `x` column of the Vortex dataset into an Arrow table."""
    dataset = vxf.to_dataset()
    connection = duckdb.connect(database=":memory:")  # pyright: ignore[reportUnknownMemberType]
    # The dataset is registered under the name the SQL text refers to.
    _ = connection.register("ds", dataset)
    query = "select ds.x from ds"

    def run_query():
        return connection.sql(query).to_arrow_table()

    benchmark(run_query)

vortex-python/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,4 +72,5 @@ dev = [
7272
"pandas-stubs>=2.2.3.241126",
7373
"pcodec>=0.3.3",
7474
"pyarrow-stubs>=17.16",
75+
"pytest-benchmark>=5.1.0",
7576
]

0 commit comments

Comments
 (0)