Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ NUM_PARTITIONS=10
NUM_BATCHES?=1 ## data split into this number of batches, more batches reduce disk space required for temporary tbl files
PARALLELISM?=8 ## number of parallel data generation processes, can be 1, unless NUM_BATCHES is 1

# Default scale factor for data generation; override it via the environment or `make SCALE_FACTOR=<n>`
SCALE_FACTOR ?= 1.0

.venv: ## Set up Python virtual environment and install dependencies
python3 -m venv $(VENV)
$(MAKE) install-deps
Expand Down Expand Up @@ -63,7 +66,6 @@ data/tables/scale-$(SCALE_FACTOR): .venv ## Generate data tables
# mkdir -p "data/tables/scale-$(SCALE_FACTOR)"
# mv tpch-dbgen/*.tbl data/tables/scale-$(SCALE_FACTOR)/
# $(VENV_BIN)/python -m scripts.prepare_data --tpch_gen_folder="data/tables/scale-$(SCALE_FACTOR)"
rm -rf data/tables/scale-$(SCALE_FACTOR)/*.tbl

.PHONY: data-tables-partitioned
data-tables-partitioned: data/tables/scale-$(SCALE_FACTOR)/${NUM_PARTITIONS}
Expand Down Expand Up @@ -113,8 +115,12 @@ run-dask: .venv data-tables ## Run Dask benchmarks
# Runs the queries.modin module with the virtualenv's Python interpreter.
# Prerequisites: the .venv target and the data-tables target.
run-modin: .venv data-tables ## Run Modin benchmarks
$(VENV_BIN)/python -m queries.modin

# Runs the queries.exasol module with the virtualenv's Python interpreter.
# Prerequisites: the .venv target and the data-tables target.
# Declared phony because the target name is a command, not a file to build.
.PHONY: run-exasol
run-exasol: .venv data-tables ## Run Exasol benchmarks
$(VENV_BIN)/python -m queries.exasol

.PHONY: run-all
run-all: run-polars run-duckdb run-pandas run-pyspark run-dask run-modin ## Run all benchmarks
run-all: run-polars run-duckdb run-pandas run-pyspark run-dask run-modin run-exasol ## Run all benchmarks

.PHONY: plot
plot: .venv ## Plot results
Expand Down
27 changes: 27 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,30 @@ This will do the following,
- Create a new virtual environment with all required dependencies.
- Generate data for benchmarks.
- Run the benchmark suite.

> **Note:** To run the Exasol benchmarks, ensure your Exasol database is configured via environment variables (or a `.env` file) with the following settings:
>
> ```
> EXASOL_HOST=<host>
> EXASOL_USER=<user>
> EXASOL_PASSWORD=<password>
> EXASOL_PORT=<port>            # optional, default: 8563
> EXASOL_SCHEMA_NAME=<schema>   # optional, default: tpc
> ```
>
> You can set SCALE_FACTOR=<scale> (default: 1.0) to control the data scale.
> Then execute:
>
> ```shell
> make run-exasol
> ```
>
> The Exasol runner prepares the database in three phases:
>
> 1. `create_schema.sql` to create empty TPC-H tables.
> 2. Import data files into these tables from the local filesystem using Exasol's
> `IMPORT FROM` functionality (via `pyexasol.import_from_file`).
> 3. Run `create_indices_1node.sql` and `analyze_database.sql` to enforce indices
> and collect statistics.
>
> Finally, it executes the 22 TPC-H queries.
Empty file added queries/exasol/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions queries/exasol/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from queries.common_utils import execute_all
from queries.exasol.utils import prepare_load_data, prepare_postload, prepare_schema

if __name__ == "__main__":
    # Database preparation, strictly in this order: create the schema,
    # load the data files into the tables, then enforce indices and
    # collect statistics on the loaded data.
    for prepare_step in (prepare_schema, prepare_load_data, prepare_postload):
        prepare_step()
    # With the database prepared, run the TPC-H queries.
    execute_all("exasol")
38 changes: 38 additions & 0 deletions queries/exasol/q1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from queries.exasol import utils

Q_NUM = 1


def q() -> None:
    """Run TPC-H query 1 against Exasol.

    Aggregates line items whose ship date is on or before 1998-09-02,
    grouped and ordered by return flag and line status. The SQL text is
    handed to ``utils.run_query`` together with ``Q_NUM``.
    """
    # Value interpolated into the FROM clause below.
    line_item_ds = utils.get_line_item_ds()
    # The cutoff is written as a typed DATE literal so the comparison does
    # not rely on implicit string-to-date conversion.
    query_str = f"""
select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
{line_item_ds}
where
l_shipdate <= date '1998-09-02'
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus
;
"""
    utils.run_query(Q_NUM, query_str)


if __name__ == "__main__":
    q()

52 changes: 52 additions & 0 deletions queries/exasol/q10.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from queries.exasol import utils

Q_NUM = 10


def q() -> None:
    """Run TPC-H query 10 against Exasol.

    Sums the discounted line-item revenue per customer for returned items
    (``l_returnflag = 'R'``) on orders placed in the three months starting
    1993-10-01, and keeps the 20 rows with the highest revenue.
    """
    # Values interpolated into the FROM clause below.
    customer_tbl = utils.get_customer_ds()
    lineitem_tbl = utils.get_line_item_ds()
    nation_tbl = utils.get_nation_ds()
    orders_tbl = utils.get_orders_ds()

    sql = f"""
select
c_custkey,
c_name,
round(sum(l_extendedprice * (1 - l_discount)), 2) as revenue,
c_acctbal,
n_name,
c_address,
c_phone,
c_comment
from
{customer_tbl},
{orders_tbl},
{lineitem_tbl},
{nation_tbl}
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate >= date '1993-10-01'
and o_orderdate < date '1993-10-01' + interval '3' month
and l_returnflag = 'R'
and c_nationkey = n_nationkey
group by
c_custkey,
c_name,
c_acctbal,
c_phone,
n_name,
c_address,
c_comment
order by
revenue desc
limit 20
;
"""
    utils.run_query(Q_NUM, sql)


if __name__ == "__main__":
    q()

46 changes: 46 additions & 0 deletions queries/exasol/q11.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from queries.exasol import utils

Q_NUM = 11


def q() -> None:
    """Run TPC-H query 11 against Exasol.

    For suppliers in nation 'GERMANY', sums ``ps_supplycost * ps_availqty``
    per part and keeps the parts whose sum exceeds 0.0001 of the same sum
    taken over all such rows, ordered by that value descending.
    """
    # Values interpolated into both FROM clauses below.
    nation_tbl = utils.get_nation_ds()
    partsupp_tbl = utils.get_part_supp_ds()
    supplier_tbl = utils.get_supplier_ds()

    sql = f"""
select
ps_partkey,
round(sum(ps_supplycost * ps_availqty), 2) as "value"
from
{partsupp_tbl},
{supplier_tbl},
{nation_tbl}
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'GERMANY'
group by
ps_partkey having
sum(ps_supplycost * ps_availqty) > (
select
sum(ps_supplycost * ps_availqty) * 0.0001
from
{partsupp_tbl},
{supplier_tbl},
{nation_tbl}
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'GERMANY'
)
order by
"value" desc
;
"""
    utils.run_query(Q_NUM, sql)


if __name__ == "__main__":
    q()

46 changes: 46 additions & 0 deletions queries/exasol/q12.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from queries.exasol import utils

Q_NUM = 12


def q() -> None:
    """Run TPC-H query 12 against Exasol.

    For ship modes 'MAIL' and 'SHIP', counts line items belonging to
    high-priority orders ('1-URGENT' or '2-HIGH') and to all other orders.
    Only line items received in the year starting 1994-01-01 are counted,
    and only when the ship date precedes the commit date and the commit
    date precedes the receipt date.
    """
    # Values interpolated into the FROM clause below.
    lineitem_tbl = utils.get_line_item_ds()
    orders_tbl = utils.get_orders_ds()

    sql = f"""
select
l_shipmode,
sum(case
when o_orderpriority = '1-URGENT'
or o_orderpriority = '2-HIGH'
then 1
else 0
end) as high_line_count,
sum(case
when o_orderpriority <> '1-URGENT'
and o_orderpriority <> '2-HIGH'
then 1
else 0
end) as low_line_count
from
{orders_tbl},
{lineitem_tbl}
where
o_orderkey = l_orderkey
and l_shipmode in ('MAIL', 'SHIP')
and l_commitdate < l_receiptdate
and l_shipdate < l_commitdate
and l_receiptdate >= date '1994-01-01'
and l_receiptdate < date '1994-01-01' + interval '1' year
group by
l_shipmode
order by
l_shipmode
;
"""
    utils.run_query(Q_NUM, sql)


if __name__ == "__main__":
    q()

36 changes: 36 additions & 0 deletions queries/exasol/q13.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from queries.exasol import utils

Q_NUM = 13


def q() -> None:
    """Run TPC-H query 13 against Exasol.

    Counts orders per customer through a left outer join, ignoring orders
    whose comment matches '%special%requests%'. It then reports how many
    customers share each order count, ordered by that number and by the
    order count, both descending.
    """
    # Values interpolated into the join below.
    customer_tbl = utils.get_customer_ds()
    orders_tbl = utils.get_orders_ds()

    sql = f"""
select
c_count, count(*) as custdist
from (
select
c_custkey,
count(o_orderkey) as c_count
from
{customer_tbl} left outer join {orders_tbl} on
c_custkey = o_custkey
and o_comment not like '%special%requests%'
group by
c_custkey
) c_orders
group by
c_count
order by
custdist desc,
c_count desc
;
"""
    utils.run_query(Q_NUM, sql)


if __name__ == "__main__":
    q()

31 changes: 31 additions & 0 deletions queries/exasol/q14.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from queries.exasol import utils

Q_NUM = 14


def q() -> None:
    """Run TPC-H query 14 against Exasol.

    Computes, rounded to two decimals, the percentage of discounted
    line-item revenue that comes from parts whose type starts with 'PROMO',
    over line items shipped in the month starting 1995-09-01.
    """
    # Values interpolated into the FROM clause below.
    lineitem_tbl = utils.get_line_item_ds()
    part_tbl = utils.get_part_ds()

    sql = f"""
select
round(100.00 * sum(case
when p_type like 'PROMO%'
then l_extendedprice * (1 - l_discount)
else 0
end) / sum(l_extendedprice * (1 - l_discount)), 2) as promo_revenue
from
{lineitem_tbl},
{part_tbl}
where
l_partkey = p_partkey
and l_shipdate >= date '1995-09-01'
and l_shipdate < date '1995-09-01' + interval '1' month
;
"""
    utils.run_query(Q_NUM, sql)


if __name__ == "__main__":
    q()

39 changes: 39 additions & 0 deletions queries/exasol/q15.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from queries.exasol import utils

Q_NUM = 15


def q() -> None:
    """Run TPC-H query 15 against Exasol.

    A ``revenue`` CTE totals discounted line-item revenue per supplier for
    line items shipped in the three months starting 1996-01-01. The outer
    query returns the supplier rows whose total equals the maximum total,
    ordered by supplier key.
    """
    # Values interpolated into the FROM clauses below.
    supplier_tbl = utils.get_supplier_ds()
    lineitem_tbl = utils.get_line_item_ds()

    sql = f"""
WITH revenue AS (
SELECT
l_suppkey AS supplier_no,
SUM(l_extendedprice * (1 - l_discount)) AS total_revenue
FROM {lineitem_tbl}
WHERE l_shipdate >= DATE '1996-01-01'
AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH
GROUP BY l_suppkey
)
SELECT
s_suppkey,
s_name,
s_address,
s_phone,
total_revenue
FROM {supplier_tbl}, revenue
WHERE s_suppkey = supplier_no
AND total_revenue = (
SELECT MAX(total_revenue) FROM revenue
)
ORDER BY s_suppkey
;
"""
    utils.run_query(Q_NUM, sql)


if __name__ == "__main__":
    q()

Loading