Skip to content

Commit e6cd9c3

Browse files
authored
Run queries in python benchmarks using only one thread (#24)
This PR adds single-threaded variants of the engines so that unit-test-style UDF benchmarks run using only one thread. This makes the performance benchmark results of UDF functions comparable across different engines.
1 parent 2051517 commit e6cd9c3

File tree

7 files changed

+141
-19
lines changed

7 files changed

+141
-19
lines changed

benchmarks/README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,34 @@ test_st_buffer[collections_simple-PostGIS] 855.3329 (9.96) 854.7194 (9.
7979
```
8080

8181
For more details and command line options, refer to the official [pytest-benchmark documentation](https://pytest-benchmark.readthedocs.io/en/latest/usage.html).
82+
83+
### Adding New Benchmarks
84+
85+
There are two types of engines, each type serving a different purpose:
86+
87+
- `SedonaDBSingleThread`, `DuckDBSingleThread`, `PostGISSingleThread`:
88+
Micro / UDF benchmarks that measure per-function cost (e.g. ST_Area, ST_Contains). These should run each engine in a comparable, single-threaded configuration (where possible) so that function-level performance differences are clearer.
89+
- `SedonaDB`, `DuckDB`, `PostGIS`:
90+
Macro / complex query benchmarks (e.g. KNN joins) that represent perceived end-user performance. Engines run with their default / natural configuration (multi-threading, internal parallelism, etc.).
91+
92+
Please choose the appropriate engines when adding a new benchmark. All existing benchmarks have been annotated accordingly.
93+
94+
Example (UDF micro benchmark in single-thread mode):
95+
```python
96+
import pytest
97+
from sedonadb.testing import SedonaDBSingleThread, DuckDBSingleThread, PostGISSingleThread
98+
99+
@pytest.mark.parametrize("eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread])
100+
def test_st_area(benchmark, eng):
101+
...
102+
```
103+
104+
Example (Query / macro benchmark in default mode):
105+
```python
106+
import pytest
107+
from sedonadb.testing import SedonaDB, DuckDB, PostGIS
108+
109+
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
110+
def test_knn_performance(benchmark, eng):
111+
...
112+
```

benchmarks/test_bench_base.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,25 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717
import json
18-
from sedonadb.testing import DuckDB, PostGIS, SedonaDB
18+
from sedonadb.testing import (
19+
DuckDB,
20+
PostGIS,
21+
SedonaDB,
22+
DuckDBSingleThread,
23+
PostGISSingleThread,
24+
SedonaDBSingleThread,
25+
)
1926

2027

2128
class TestBenchBase:
2229
def setup_class(self):
2330
self.sedonadb = SedonaDB.create_or_skip()
2431
self.postgis = PostGIS.create_or_skip()
2532
self.duckdb = DuckDB.create_or_skip()
33+
# Single-thread engine instances
34+
self.sedonadb_single = SedonaDBSingleThread.create_or_skip()
35+
self.postgis_single = PostGISSingleThread.create_or_skip()
36+
self.duckdb_single = DuckDBSingleThread.create_or_skip()
2637

2738
num_geoms = 100_000
2839

@@ -128,6 +139,10 @@ def setup_class(self):
128139
self.sedonadb.create_table_arrow(name, tab)
129140
self.postgis.create_table_arrow(name, tab)
130141
self.duckdb.create_table_arrow(name, tab)
142+
self.sedonadb_single.create_table_arrow(name, tab)
143+
self.duckdb_single.create_table_arrow(name, tab)
144+
# We don't need to call self.postgis_single.create_table_arrow
145+
# because it shares the same database with self.postgis
131146

132147
def _get_eng(self, eng):
133148
if eng == SedonaDB:
@@ -136,5 +151,11 @@ def _get_eng(self, eng):
136151
return self.postgis
137152
elif eng == DuckDB:
138153
return self.duckdb
154+
elif eng == SedonaDBSingleThread:
155+
return self.sedonadb_single
156+
elif eng == PostGISSingleThread:
157+
return self.postgis_single
158+
elif eng == DuckDBSingleThread:
159+
return self.duckdb_single
139160
else:
140161
raise ValueError(f"Unsupported engine: {eng}")

benchmarks/test_distance.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616
# under the License.
1717
import pytest
1818
from test_bench_base import TestBenchBase
19-
from sedonadb.testing import DuckDB, PostGIS, SedonaDB
19+
from sedonadb.testing import (
20+
DuckDBSingleThread,
21+
PostGISSingleThread,
22+
SedonaDBSingleThread,
23+
)
2024

2125

2226
class TestBenchPredicates(TestBenchBase):
23-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
27+
@pytest.mark.parametrize(
28+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
29+
)
2430
@pytest.mark.parametrize(
2531
"table",
2632
[

benchmarks/test_functions.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616
# under the License.
1717
import pytest
1818
from test_bench_base import TestBenchBase
19-
from sedonadb.testing import DuckDB, PostGIS, SedonaDB
19+
from sedonadb.testing import (
20+
DuckDBSingleThread,
21+
SedonaDBSingleThread,
22+
PostGISSingleThread,
23+
)
2024

2125

2226
class TestBenchFunctions(TestBenchBase):
23-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
27+
@pytest.mark.parametrize(
28+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
29+
)
2430
@pytest.mark.parametrize(
2531
"table",
2632
[
@@ -36,7 +42,9 @@ def queries():
3642

3743
benchmark(queries)
3844

39-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
45+
@pytest.mark.parametrize(
46+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
47+
)
4048
@pytest.mark.parametrize(
4149
"table",
4250
[
@@ -51,7 +59,9 @@ def queries():
5159

5260
benchmark(queries)
5361

54-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
62+
@pytest.mark.parametrize(
63+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
64+
)
5565
@pytest.mark.parametrize(
5666
"table",
5767
[
@@ -67,7 +77,9 @@ def queries():
6777

6878
benchmark(queries)
6979

70-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
80+
@pytest.mark.parametrize(
81+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
82+
)
7183
@pytest.mark.parametrize(
7284
"table",
7385
[
@@ -83,7 +95,9 @@ def queries():
8395

8496
benchmark(queries)
8597

86-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
98+
@pytest.mark.parametrize(
99+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
100+
)
87101
@pytest.mark.parametrize(
88102
"table",
89103
[
@@ -99,7 +113,9 @@ def queries():
99113

100114
benchmark(queries)
101115

102-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
116+
@pytest.mark.parametrize(
117+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
118+
)
103119
@pytest.mark.parametrize(
104120
"table",
105121
[
@@ -115,7 +131,9 @@ def queries():
115131

116132
benchmark(queries)
117133

118-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
134+
@pytest.mark.parametrize(
135+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
136+
)
119137
@pytest.mark.parametrize(
120138
"table",
121139
[
@@ -131,7 +149,9 @@ def queries():
131149

132150
benchmark(queries)
133151

134-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
152+
@pytest.mark.parametrize(
153+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
154+
)
135155
@pytest.mark.parametrize(
136156
"table",
137157
[
@@ -147,7 +167,9 @@ def queries():
147167

148168
benchmark(queries)
149169

150-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
170+
@pytest.mark.parametrize(
171+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
172+
)
151173
@pytest.mark.parametrize(
152174
"table",
153175
[
@@ -164,7 +186,9 @@ def queries():
164186

165187
benchmark(queries)
166188

167-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
189+
@pytest.mark.parametrize(
190+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
191+
)
168192
@pytest.mark.parametrize(
169193
"table",
170194
[

benchmarks/test_overlay.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616
# under the License.
1717
import pytest
1818
from test_bench_base import TestBenchBase
19-
from sedonadb.testing import DuckDB, PostGIS, SedonaDB
19+
from sedonadb.testing import (
20+
DuckDBSingleThread,
21+
PostGISSingleThread,
22+
SedonaDBSingleThread,
23+
)
2024

2125

2226
class TestBenchPredicates(TestBenchBase):
23-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
27+
@pytest.mark.parametrize(
28+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
29+
)
2430
@pytest.mark.parametrize(
2531
"table",
2632
[

benchmarks/test_predicates.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,17 @@
1616
# under the License.
1717
import pytest
1818
from test_bench_base import TestBenchBase
19-
from sedonadb.testing import DuckDB, PostGIS, SedonaDB
19+
from sedonadb.testing import (
20+
DuckDBSingleThread,
21+
PostGISSingleThread,
22+
SedonaDBSingleThread,
23+
)
2024

2125

2226
class TestBenchPredicates(TestBenchBase):
23-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
27+
@pytest.mark.parametrize(
28+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
29+
)
2430
@pytest.mark.parametrize(
2531
"table",
2632
[
@@ -36,7 +42,9 @@ def queries():
3642

3743
benchmark(queries)
3844

39-
@pytest.mark.parametrize("eng", [SedonaDB, PostGIS, DuckDB])
45+
@pytest.mark.parametrize(
46+
"eng", [SedonaDBSingleThread, PostGISSingleThread, DuckDBSingleThread]
47+
)
4048
@pytest.mark.parametrize(
4149
"table",
4250
[

python/sedonadb/python/sedonadb/testing.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,15 @@ def execute_and_collect(self, query) -> "sedonadb.dataframe.DataFrame":
344344
return self.con.sql(query).to_arrow_table()
345345

346346

347+
class SedonaDBSingleThread(SedonaDB):
    """SedonaDB engine variant restricted to single-threaded execution.

    Behaves like ``SedonaDB`` except that, right after the parent
    initializer has run, the connection is configured with a single
    DataFusion target partition.
    """

    def __init__(self):
        super().__init__()
        # Limiting the plan to one target partition forces
        # single-threaded execution.
        single_partition_stmt = "SET datafusion.execution.target_partitions TO 1"
        self.con.sql(single_partition_stmt)
354+
355+
347356
class DuckDB(DBEngine):
348357
"""A DuckDB implementation of the DBEngine using DuckDB Python"""
349358

@@ -395,6 +404,14 @@ def execute_and_collect(self, query) -> pa.Table:
395404
return self.con.sql(query).fetch_arrow_table()
396405

397406

407+
class DuckDBSingleThread(DuckDB):
    """DuckDB engine variant restricted to single-threaded execution.

    Behaves like ``DuckDB`` except that, right after the parent
    initializer has run, the connection's ``threads`` setting is set to 1.
    """

    def __init__(self):
        super().__init__()
        # Cap the connection at one thread.
        single_thread_stmt = "SET threads TO 1"
        self.con.sql(single_thread_stmt)
413+
414+
398415
class PostGIS(DBEngine):
399416
"""A PostGIS implementation of the DBEngine using ADBC
400417
@@ -598,6 +615,15 @@ def _insert_srid_if_needed(self, crs):
598615
return col_srid
599616

600617

618+
class PostGISSingleThread(PostGIS):
    """PostGIS engine variant with parallel workers disabled.

    Behaves like ``PostGIS`` except that, right after the parent
    initializer has run, ``max_parallel_workers_per_gather`` is set to 0
    on this engine's connection so that queries run single-threaded
    (no parallel workers).

    Args:
        uri: Passed through unchanged to the ``PostGIS`` initializer.
    """

    def __init__(self, uri=None):
        super().__init__(uri)
        no_parallel_workers_stmt = "SET max_parallel_workers_per_gather TO 0"
        # The cursor is only needed to issue the setting; it is closed on exit.
        with self.con.cursor() as cursor:
            cursor.execute(no_parallel_workers_stmt)
625+
626+
601627
def geom_or_null(arg):
602628
"""Format SQL expression for a geometry object or NULL"""
603629
if arg is None:

0 commit comments

Comments
 (0)