diff --git a/Makefile b/Makefile index 6b4e324..26c5742 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,9 @@ NUM_PARTITIONS=10 NUM_BATCHES?=1 ## data split into this number of batches, more batches reduce disk space required for temporary tbl files PARALLELISM?=8 ## number of parallel data generation processes, can be 1, unless NUM_BATCHES is 1 +# Default scale factor for data generation (can be overridden via the environment) +SCALE_FACTOR ?= 1.0 + .venv: ## Set up Python virtual environment and install dependencies python3 -m venv $(VENV) $(MAKE) install-deps @@ -63,7 +66,6 @@ data/tables/scale-$(SCALE_FACTOR): .venv ## Generate data tables # mkdir -p "data/tables/scale-$(SCALE_FACTOR)" # mv tpch-dbgen/*.tbl data/tables/scale-$(SCALE_FACTOR)/ # $(VENV_BIN)/python -m scripts.prepare_data --tpch_gen_folder="data/tables/scale-$(SCALE_FACTOR)" - rm -rf data/tables/scale-$(SCALE_FACTOR)/*.tbl .PHONY: data-tables-partitioned data-tables-partitioned: data/tables/scale-$(SCALE_FACTOR)/${NUM_PARTITIONS} @@ -113,8 +115,12 @@ run-dask: .venv data-tables ## Run Dask benchmarks run-modin: .venv data-tables ## Run Modin benchmarks $(VENV_BIN)/python -m queries.modin +.PHONY: run-exasol +run-exasol: .venv data-tables ## Run Exasol benchmarks + $(VENV_BIN)/python -m queries.exasol + .PHONY: run-all -run-all: run-polars run-duckdb run-pandas run-pyspark run-dask run-modin ## Run all benchmarks +run-all: run-polars run-duckdb run-pandas run-pyspark run-dask run-modin run-exasol ## Run all benchmarks .PHONY: plot plot: .venv ## Plot results diff --git a/README.md b/README.md index c5d7187..d37ad1c 100644 --- a/README.md +++ b/README.md @@ -43,3 +43,30 @@ This will do the following, - Create a new virtual environment with all required dependencies. - Generate data for benchmarks. - Run the benchmark suite. + +> **Note:** To run the Exasol benchmarks, ensure your Exasol database is configured via environment variables (or a `.env` file) with the following settings: > > ``` > EXASOL_HOST= > EXASOL_USER= > EXASOL_PASSWORD= > (optional) EXASOL_PORT= # default: 8563 > (optional) EXASOL_SCHEMA_NAME= # default: tpc > ``` > > You can also set `SCALE_FACTOR` (default: 1.0) to control the scale of the generated data. > Then execute: > > ```shell > make run-exasol > ``` > > The Exasol runner prepares the database in three phases: > > 1. Run `create_schema.sql` to create empty TPC-H tables. > 2. Import data files into these tables from the local filesystem using Exasol's > `IMPORT FROM` functionality (via `pyexasol.import_from_file`). > 3. Run `create_indices_1node.sql` and `analyze_database.sql` to enforce indices > and collect statistics. > > Finally, it executes the 22 TPC-H queries.
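
To make phase 2 of the README note concrete, here is a minimal, hypothetical sketch of a bulk load with `pyexasol.import_from_file`. It assumes the environment variables listed above and dbgen's pipe-separated `.tbl` files under `data/tables/scale-<SCALE_FACTOR>/`; the file layout, the loop, and the `column_separator` import parameter are illustrative assumptions, not the benchmark's actual loader (which lives in `queries/exasol/utils.py::prepare_load_data` and additionally handles the trailing empty field that dbgen appends to each record).

```python
# Hypothetical sketch of the phase-2 data load; NOT the code from
# queries/exasol/utils.py. Assumes pipe-separated .tbl files under
# data/tables/scale-<SCALE_FACTOR>/ and the env vars from the README note.
import os
from pathlib import Path

import pyexasol

con = pyexasol.connect(
    dsn=f"{os.environ['EXASOL_HOST']}:{os.environ.get('EXASOL_PORT', '8563')}",
    user=os.environ["EXASOL_USER"],
    password=os.environ["EXASOL_PASSWORD"],
    schema=os.environ.get("EXASOL_SCHEMA_NAME", "tpc"),
)

scale = os.environ.get("SCALE_FACTOR", "1.0")
data_dir = Path(f"data/tables/scale-{scale}")

tables = [
    "nation", "region", "part", "supplier",
    "partsupp", "customer", "orders", "lineitem",
]
for table in tables:
    # import_from_file streams the file into the table via Exasol's
    # IMPORT command; '|' as column separator matches dbgen output
    # (import_params key assumed here for illustration).
    con.import_from_file(
        str(data_dir / f"{table}.tbl"),
        table,
        import_params={"column_separator": "|"},
    )
    # Rough equivalent of the loader's row-count verification.
    rows = con.execute(f"SELECT count(*) FROM {table}").fetchval()
    print(f"{table}: {rows} rows loaded")

con.commit()
```
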
diff --git a/queries/exasol/__init__.py b/queries/exasol/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/queries/exasol/__main__.py b/queries/exasol/__main__.py new file mode 100644 index 0000000..9a5f76e --- /dev/null +++ b/queries/exasol/__main__.py @@ -0,0 +1,12 @@ +from queries.common_utils import execute_all +from queries.exasol.utils import prepare_load_data, prepare_postload, prepare_schema + +if __name__ == "__main__": + # create schema before loading data + prepare_schema() + # load data files into Exasol tables + prepare_load_data() + # enforce indices and collect statistics after data load + prepare_postload() + # run the TPC-H queries + execute_all("exasol") diff --git a/queries/exasol/q1.py b/queries/exasol/q1.py new file mode 100644 index 0000000..e2d14e8 --- /dev/null +++ b/queries/exasol/q1.py @@ -0,0 +1,38 @@ +from queries.exasol import utils + +Q_NUM = 1 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + query_str = f""" + select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order + from + {line_item_ds} + where + l_shipdate <= '1998-09-02' + group by + l_returnflag, + l_linestatus + order by + l_returnflag, + l_linestatus + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q10.py b/queries/exasol/q10.py new file mode 100644 index 0000000..5e6f4c4 --- /dev/null +++ b/queries/exasol/q10.py @@ -0,0 +1,52 @@ +from queries.exasol import utils + +Q_NUM = 10 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + line_item_ds = utils.get_line_item_ds() + nation_ds = utils.get_nation_ds() + orders_ds = utils.get_orders_ds() + query_str = f""" + select + c_custkey, + c_name, + round(sum(l_extendedprice * (1 - l_discount)), 2) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment + from + {customer_ds}, + {orders_ds}, + {line_item_ds}, + {nation_ds} + where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1993-10-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey + group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment + order by + revenue desc + limit 20 + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q11.py b/queries/exasol/q11.py new file mode 100644 index 0000000..5aa4cbe --- /dev/null +++ b/queries/exasol/q11.py @@ -0,0 +1,46 @@ +from queries.exasol import utils + +Q_NUM = 11 + + +def q() -> None: + + nation_ds = utils.get_nation_ds() + part_supp_ds = utils.get_part_supp_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + ps_partkey, + round(sum(ps_supplycost * ps_availqty), 2) as "value" + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds} + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001 + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds} + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) + order by + "value" desc + 
; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q12.py b/queries/exasol/q12.py new file mode 100644 index 0000000..f8af845 --- /dev/null +++ b/queries/exasol/q12.py @@ -0,0 +1,46 @@ +from queries.exasol import utils + +Q_NUM = 12 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + orders_ds = utils.get_orders_ds() + query_str = f""" + select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count + from + {orders_ds}, + {line_item_ds} + where + o_orderkey = l_orderkey + and l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1994-01-01' + interval '1' year + group by + l_shipmode + order by + l_shipmode + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q13.py b/queries/exasol/q13.py new file mode 100644 index 0000000..623a1ff --- /dev/null +++ b/queries/exasol/q13.py @@ -0,0 +1,36 @@ +from queries.exasol import utils + +Q_NUM = 13 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + orders_ds = utils.get_orders_ds() + query_str = f""" + select + c_count, count(*) as custdist + from ( + select + c_custkey, + count(o_orderkey) as c_count + from + {customer_ds} left outer join {orders_ds} on + c_custkey = o_custkey + and o_comment not like '%special%requests%' + group by + c_custkey + ) c_orders + group by + c_count + order by + custdist desc, + c_count desc + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q14.py b/queries/exasol/q14.py new file mode 100644 index 0000000..cad2ade --- /dev/null +++ b/queries/exasol/q14.py @@ -0,0 +1,31 @@ +from queries.exasol import utils + +Q_NUM = 14 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + part_ds = utils.get_part_ds() + query_str = f""" + select + round(100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)), 2) as promo_revenue + from + {line_item_ds}, + {part_ds} + where + l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-09-01' + interval '1' month + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q15.py b/queries/exasol/q15.py new file mode 100644 index 0000000..ff6f6e6 --- /dev/null +++ b/queries/exasol/q15.py @@ -0,0 +1,39 @@ +from queries.exasol import utils + +Q_NUM = 15 + + +def q() -> None: + + supplier_ds = utils.get_supplier_ds() + line_item_ds = utils.get_line_item_ds() + query_str = f""" + WITH revenue AS ( + SELECT + l_suppkey AS supplier_no, + SUM(l_extendedprice * (1 - l_discount)) AS total_revenue + FROM {line_item_ds} + WHERE l_shipdate >= DATE '1996-01-01' + AND l_shipdate < DATE '1996-01-01' + INTERVAL '3' MONTH + GROUP BY l_suppkey + ) + SELECT + s_suppkey, + s_name, + s_address, + s_phone, + total_revenue + FROM {supplier_ds}, revenue + WHERE s_suppkey = supplier_no + AND total_revenue = ( + SELECT MAX(total_revenue) FROM revenue + ) + ORDER BY s_suppkey + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q16.py 
b/queries/exasol/q16.py new file mode 100644 index 0000000..8050161 --- /dev/null +++ b/queries/exasol/q16.py @@ -0,0 +1,49 @@ +from queries.exasol import utils + +Q_NUM = 16 + + +def q() -> None: + + part_ds = utils.get_part_ds() + part_supp_ds = utils.get_part_supp_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt + from + {part_supp_ds}, + {part_ds} + where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' + and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + {supplier_ds} + where + s_comment like '%Customer%Complaints%' + ) + group by + p_brand, + p_type, + p_size + order by + supplier_cnt desc, + p_brand, + p_type, + p_size + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q17.py b/queries/exasol/q17.py new file mode 100644 index 0000000..efc7a9d --- /dev/null +++ b/queries/exasol/q17.py @@ -0,0 +1,35 @@ +from queries.exasol import utils + +Q_NUM = 17 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + part_ds = utils.get_part_ds() + query_str = f""" + select + round(sum(l_extendedprice) / 7.0, 2) as avg_yearly + from + {line_item_ds}, + {part_ds} + where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + {line_item_ds} + where + l_partkey = p_partkey + ) + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q18.py b/queries/exasol/q18.py new file mode 100644 index 0000000..5b5ee7b --- /dev/null +++ b/queries/exasol/q18.py @@ -0,0 +1,52 @@ +from queries.exasol import utils + +Q_NUM = 18 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + line_item_ds = utils.get_line_item_ds() + orders_ds = utils.get_orders_ds() + query_str = f""" + select + c_name, + c_custkey, + o_orderkey, + o_orderdate as o_orderdat, + o_totalprice, + sum(l_quantity) as col6 + from + {customer_ds}, + {orders_ds}, + {line_item_ds} + where + o_orderkey in ( + select + l_orderkey + from + {line_item_ds} + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey + group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice + order by + o_totalprice desc, + o_orderdate + limit 100 + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q19.py b/queries/exasol/q19.py new file mode 100644 index 0000000..9c34ee0 --- /dev/null +++ b/queries/exasol/q19.py @@ -0,0 +1,53 @@ +from queries.exasol import utils + +Q_NUM = 19 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + part_ds = utils.get_part_ds() + query_str = f""" + select + round(sum(l_extendedprice* (1 - l_discount)), 2) as revenue + from + {line_item_ds}, + {part_ds} + where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 20 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and 
l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 30 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q2.py b/queries/exasol/q2.py new file mode 100644 index 0000000..bdb2eb4 --- /dev/null +++ b/queries/exasol/q2.py @@ -0,0 +1,65 @@ +from queries.exasol import utils + +Q_NUM = 2 + + +def q() -> None: + + nation_ds = utils.get_nation_ds() + part_ds = utils.get_part_ds() + part_supp_ds = utils.get_part_supp_ds() + region_ds = utils.get_region_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment + from + {part_ds}, + {supplier_ds}, + {part_supp_ds}, + {nation_ds}, + {region_ds} + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds}, + {region_ds} + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + ) + order by + s_acctbal desc, + n_name, + s_name, + p_partkey + limit 100 + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q20.py b/queries/exasol/q20.py new file mode 100644 index 0000000..9641fd3 --- /dev/null +++ b/queries/exasol/q20.py @@ -0,0 +1,58 @@ +from queries.exasol import utils + +Q_NUM = 20 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + nation_ds = utils.get_nation_ds() + part_ds = utils.get_part_ds() + part_supp_ds = utils.get_part_supp_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + s_name, + s_address + from + {supplier_ds}, + {nation_ds} + where + s_suppkey in ( + select + ps_suppkey + from + {part_supp_ds} + where + ps_partkey in ( + select + p_partkey + from + {part_ds} + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + {line_item_ds} + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' + order by + s_name + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q21.py b/queries/exasol/q21.py new file mode 100644 index 0000000..1af88ff --- /dev/null +++ b/queries/exasol/q21.py @@ -0,0 +1,60 @@ +from queries.exasol import utils + +Q_NUM = 21 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + nation_ds = utils.get_nation_ds() + orders_ds = utils.get_orders_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + s_name, + count(*) as numwait + from + {supplier_ds}, + {line_item_ds} l1, + {orders_ds}, + {nation_ds} + where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + {line_item_ds} l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and 
not exists ( + select + * + from + {line_item_ds} l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' + group by + s_name + order by + numwait desc, + s_name + limit 100 + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q22.py b/queries/exasol/q22.py new file mode 100644 index 0000000..bf9faa8 --- /dev/null +++ b/queries/exasol/q22.py @@ -0,0 +1,54 @@ +from queries.exasol import utils + +Q_NUM = 22 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + orders_ds = utils.get_orders_ds() + query_str = f""" + select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal + from ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + {customer_ds} + where + substring(c_phone from 1 for 2) in + (13, 31, 23, 29, 30, 18, 17) + and c_acctbal > ( + select + avg(c_acctbal) + from + {customer_ds} + where + c_acctbal > 0.00 + and substring (c_phone from 1 for 2) in + (13, 31, 23, 29, 30, 18, 17) + ) + and not exists ( + select + * + from + {orders_ds} + where + o_custkey = c_custkey + ) + ) as custsale + group by + cntrycode + order by + cntrycode + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q3.py b/queries/exasol/q3.py new file mode 100644 index 0000000..8ccd6f8 --- /dev/null +++ b/queries/exasol/q3.py @@ -0,0 +1,42 @@ +from queries.exasol import utils + +Q_NUM = 3 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + line_item_ds = utils.get_line_item_ds() + orders_ds = utils.get_orders_ds() + query_str = f""" + select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority + from + {customer_ds}, + {orders_ds}, + {line_item_ds} + where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < '1995-03-15' + and l_shipdate > '1995-03-15' + group by + l_orderkey, + o_orderdate, + o_shippriority + order by + revenue desc, + o_orderdate + limit 10 + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q4.py b/queries/exasol/q4.py new file mode 100644 index 0000000..658ab6e --- /dev/null +++ b/queries/exasol/q4.py @@ -0,0 +1,39 @@ +from queries.exasol import utils + +Q_NUM = 4 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + orders_ds = utils.get_orders_ds() + query_str = f""" + select + o_orderpriority, + count(*) as order_count + from + {orders_ds} + where + o_orderdate >= timestamp '1993-07-01' + and o_orderdate < timestamp '1993-07-01' + interval '3' month + and exists ( + select + * + from + {line_item_ds} + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) + group by + o_orderpriority + order by + o_orderpriority + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q5.py b/queries/exasol/q5.py new file mode 100644 index 0000000..be050d1 --- /dev/null +++ b/queries/exasol/q5.py @@ -0,0 +1,46 @@ +from queries.exasol import utils + +Q_NUM = 5 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + line_item_ds = utils.get_line_item_ds() + nation_ds = utils.get_nation_ds() + orders_ds = utils.get_orders_ds() + region_ds = utils.get_region_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + n_name, + 
sum(l_extendedprice * (1 - l_discount)) as revenue + from + {customer_ds}, + {orders_ds}, + {line_item_ds}, + {supplier_ds}, + {nation_ds}, + {region_ds} + where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= timestamp '1994-01-01' + and o_orderdate < timestamp '1994-01-01' + interval '1' year + group by + n_name + order by + revenue desc + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q6.py b/queries/exasol/q6.py new file mode 100644 index 0000000..52d1d95 --- /dev/null +++ b/queries/exasol/q6.py @@ -0,0 +1,26 @@ +from queries.exasol import utils + +Q_NUM = 6 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + query_str = f""" + select + sum(l_extendedprice * l_discount) as revenue + from + {line_item_ds} + where + l_shipdate >= timestamp '1994-01-01' + and l_shipdate < timestamp '1994-01-01' + interval '1' year + and l_discount between .06 - 0.01 and .06 + 0.01 + and l_quantity < 24 + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q7.py b/queries/exasol/q7.py new file mode 100644 index 0000000..8f1302e --- /dev/null +++ b/queries/exasol/q7.py @@ -0,0 +1,60 @@ +from queries.exasol import utils + +Q_NUM = 7 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + line_item_ds = utils.get_line_item_ds() + nation_ds = utils.get_nation_ds() + orders_ds = utils.get_orders_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue + from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + year(l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + {supplier_ds}, + {line_item_ds}, + {orders_ds}, + {customer_ds}, + {nation_ds} n1, + {nation_ds} n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') + or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') + ) + and l_shipdate between timestamp '1995-01-01' and timestamp '1996-12-31' + ) as shipping + group by + supp_nation, + cust_nation, + l_year + order by + supp_nation, + cust_nation, + l_year + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q8.py b/queries/exasol/q8.py new file mode 100644 index 0000000..86b18a4 --- /dev/null +++ b/queries/exasol/q8.py @@ -0,0 +1,62 @@ +from queries.exasol import utils + +Q_NUM = 8 + + +def q() -> None: + + customer_ds = utils.get_customer_ds() + line_item_ds = utils.get_line_item_ds() + nation_ds = utils.get_nation_ds() + orders_ds = utils.get_orders_ds() + part_ds = utils.get_part_ds() + region_ds = utils.get_region_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + o_year, + round( + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) + , 2) as mkt_share + from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + {part_ds}, + {supplier_ds}, + {line_item_ds}, + {orders_ds}, + {customer_ds}, + {nation_ds} n1, + {nation_ds} n2, + {region_ds} + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and 
l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between timestamp '1995-01-01' and timestamp '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations + group by + o_year + order by + o_year + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/q9.py b/queries/exasol/q9.py new file mode 100644 index 0000000..c23e452 --- /dev/null +++ b/queries/exasol/q9.py @@ -0,0 +1,54 @@ +from queries.exasol import utils + +Q_NUM = 9 + + +def q() -> None: + + line_item_ds = utils.get_line_item_ds() + nation_ds = utils.get_nation_ds() + orders_ds = utils.get_orders_ds() + part_ds = utils.get_part_ds() + part_supp_ds = utils.get_part_supp_ds() + supplier_ds = utils.get_supplier_ds() + query_str = f""" + select + nation, + o_year, + round(sum(amount), 2) as sum_profit + from + ( + select + n_name as nation, + year(o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + {part_ds}, + {supplier_ds}, + {line_item_ds}, + {part_supp_ds}, + {orders_ds}, + {nation_ds} + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit + group by + nation, + o_year + order by + nation, + o_year desc + ; + """ + utils.run_query(Q_NUM, query_str) + + +if __name__ == "__main__": + q() + diff --git a/queries/exasol/queries/analyze_database.sql b/queries/exasol/queries/analyze_database.sql new file mode 100644 index 0000000..7e1ca03 --- /dev/null +++ b/queries/exasol/queries/analyze_database.sql @@ -0,0 +1,6 @@ +open schema tpc; + +analyze database estimate statistics; +commit; + +select to_char(current_timestamp, 'MM/DD/YYYY HH:MI:SS.FF3') 'END_OF_OPTIMIZATION' from dual; diff --git a/queries/exasol/queries/create_indices_1node.sql b/queries/exasol/queries/create_indices_1node.sql new file mode 100644 index 0000000..c53b213 --- /dev/null +++ b/queries/exasol/queries/create_indices_1node.sql @@ -0,0 +1,22 @@ +set autocommit off; +open schema tpc; + +enforce local index on lineitem (l_suppkey); +enforce local index on lineitem (l_partkey, l_suppkey); +enforce local index on lineitem (l_partkey); +enforce local index on lineitem (l_orderkey); +enforce local index on nation (n_nationkey); +enforce local index on region (r_regionkey); +enforce local index on supplier (s_suppkey); +enforce local index on supplier (s_nationkey); +enforce local index on customer (c_custkey); +enforce local index on customer (c_nationkey); +enforce local index on part (p_partkey); +enforce local index on partsupp (ps_partkey, ps_suppkey); +enforce local index on partsupp (ps_partkey); +enforce local index on partsupp (ps_suppkey); +enforce local index on orders (o_orderkey); +enforce local index on orders (o_custkey); +commit; + +select to_char(current_timestamp, 'MM/DD/YYYY HH:MI:SS.FF3') 'END_OF_OPTIMIZATION' from dual; diff --git a/queries/exasol/queries/create_schema.sql b/queries/exasol/queries/create_schema.sql new file mode 100644 index 0000000..07d83ac --- /dev/null +++ b/queries/exasol/queries/create_schema.sql @@ -0,0 +1,20 @@ +set autocommit off; +create schema if not exists tpc; +open schema tpc; + +-- keys dec(11) with exception of ORDERKEY dec(12) due to scaling requirements (Clause 1.3.1 - last comment) 
+-- integers dec(10) +-- decimals dec(12,2) + +create or replace table nation ( n_nationkey dec(11), n_name char(25) character set ascii, n_regionkey dec(11), n_comment varchar(152) character set ascii ); +create or replace table region ( r_regionkey dec(11), r_name char(25) character set ascii, r_comment varchar(152) character set ascii ); +create or replace table part ( p_partkey dec(11), p_name varchar(55) character set ascii, p_mfgr char(25) character set ascii, p_brand char(10) character set ascii, p_type varchar(25) character set ascii, p_size dec(10), p_container char(10) character set ascii, p_retailprice decimal(12,2), p_comment varchar(23) character set ascii, distribute by p_partkey ); +create or replace table supplier ( s_suppkey dec(11), s_name char(25) character set ascii, s_address varchar(40) character set ascii, s_nationkey dec(11), s_phone char(15) character set ascii, s_acctbal decimal(12,2), s_comment varchar(101) character set ascii, distribute by s_suppkey ); +create or replace table partsupp ( ps_partkey dec(11), ps_suppkey dec(11), ps_availqty dec(10), ps_supplycost decimal(12,2), ps_comment varchar(199) character set ascii, distribute by ps_partkey ); +create or replace table customer ( c_custkey dec(11), c_name varchar(25) character set ascii, c_address varchar(40) character set ascii, c_nationkey dec(11), c_phone char(15) character set ascii, c_acctbal decimal(12,2), c_mktsegment char(10) character set ascii, c_comment varchar(117) character set ascii, distribute by c_custkey); +create or replace table orders ( o_orderkey dec(12), o_custkey dec(11), o_orderstatus char(1) character set ascii, o_totalprice decimal(12,2), o_orderdate date, o_orderpriority char(15) character set ascii, o_clerk char(15) character set ascii, o_shippriority dec(10), o_comment varchar(79) character set ascii, distribute by o_custkey); +create or replace table lineitem ( l_orderkey dec(12), l_partkey dec(11), l_suppkey dec(11), l_linenumber dec(10), l_quantity decimal(12,2), l_extendedprice decimal(12,2), l_discount decimal(12,2), l_tax decimal(12,2), l_returnflag char(1) character set ascii, l_linestatus char(1) character set ascii, l_shipdate date, l_commitdate date, l_receiptdate date, l_shipinstruct char(25) character set ascii, l_shipmode char(10) character set ascii, l_comment varchar(44) character set ascii, distribute by l_orderkey ); + +commit; + +select to_char(current_timestamp, 'MM/DD/YYYY HH:MI:SS.FF3') 'END_OF_CREATE_SCHEMA' from dual; diff --git a/queries/exasol/queries/queries.txt b/queries/exasol/queries/queries.txt new file mode 100644 index 0000000..5c38c1c --- /dev/null +++ b/queries/exasol/queries/queries.txt @@ -0,0 +1,1531 @@ +============================== += Logfile for query stream 0 = +============================== + +-- using 925162709 as a seed to the RNG +OPEN SCHEMA tpc; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437112 s, 117014 usec), finished at (epoch: 1750437112 s, 158447 usec), time used: 0.04 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437112 s, 158469 usec), finished at (epoch: 1750437112 s, 160051 usec), time used: 0.00 s + +******************** +* TPC-H Query 1 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Pricing Summary Query (Q1) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 1 0 + select + l_returnflag, + l_linestatus, + sum(l_quantity) as sum_qty, + sum(l_extendedprice) as sum_base_price, + sum(l_extendedprice * (1 - l_discount)) as 
sum_disc_price, + sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge, + avg(l_quantity) as avg_qty, + avg(l_extendedprice) as avg_price, + avg(l_discount) as avg_disc, + count(*) as count_order + from + {line_item_ds} + where + l_shipdate <= '1998-09-02' + group by + l_returnflag, + l_linestatus + order by + l_returnflag, + l_linestatus + ; + +******************** +* TPC-H Query 14 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Promotion Effect Query (Q14) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 14 0 + + + + + + select + round(100.00 * sum(case + when p_type like 'PROMO%' + then l_extendedprice * (1 - l_discount) + else 0 + end) / sum(l_extendedprice * (1 - l_discount)), 2) as promo_revenue + from + {line_item_ds}, + {part_ds} + where + l_partkey = p_partkey + and l_shipdate >= date '1995-09-01' + and l_shipdate < date '1995-09-01' + interval '1' month + ; + + +=============== + +resultset consists of 1 rows in 1 columns + +-- current part timing: started at (epoch: 1750437112 s, 160086 usec), finished at (epoch: 1750437113 s, 131363 usec), time used: 0.97 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437113 s, 131375 usec), finished at (epoch: 1750437113 s, 149961 usec), time used: 0.02 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:31:52 2025 + (epoch: 1750437112 s, 160086 usec) +finished at: Fri Jun 20 18:31:53 2025 + (epoch: 1750437113 s, 149961 usec) +time used : 0.99 s + +******************* +* TPC-H Query 2 0 * +******************* + +-- $ID$ +-- TPC-H/TPC-R Minimum Cost Supplier Query (Q2) +-- Functional Query Definition +-- Approved February 1998 +-- Minor modification - result set limit ( 2.1.2.9.3 ) +-- TPC-H Query 2 0 + + + + + + select + s_acctbal, + s_name, + n_name, + p_partkey, + p_mfgr, + s_address, + s_phone, + s_comment + from + {part_ds}, + {supplier_ds}, + {part_supp_ds}, + {nation_ds}, + {region_ds} + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and p_size = 15 + and p_type like '%BRASS' + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + and ps_supplycost = ( + select + min(ps_supplycost) + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds}, + {region_ds} + where + p_partkey = ps_partkey + and s_suppkey = ps_suppkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'EUROPE' + ) + order by + s_acctbal desc, + n_name, + s_name, + p_partkey + limit 100 + ; + + +====================================================================== + +resultset consists of 100 rows in 8 columns + +-- current part timing: started at (epoch: 1750437113 s, 150921 usec), finished at (epoch: 1750437113 s, 627181 usec), time used: 0.48 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437113 s, 627204 usec), finished at (epoch: 1750437113 s, 848167 usec), time used: 0.22 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:31:53 2025 + (epoch: 1750437113 s, 150921 usec) +finished at: Fri Jun 20 18:31:53 2025 + (epoch: 1750437113 s, 848167 usec) +time used : 0.70 s + +******************* +* TPC-H Query 9 0 * +******************* + +-- $ID$ +-- TPC-H/TPC-R Product Type Profit Measure Query (Q9) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 9 0 + + + + + + select + nation, + o_year, + round(sum(amount), 2) as sum_profit + from + ( + select + n_name as nation, + year(o_orderdate) as o_year, + 
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount + from + {part_ds}, + {supplier_ds}, + {line_item_ds}, + {part_supp_ds}, + {orders_ds}, + {nation_ds} + where + s_suppkey = l_suppkey + and ps_suppkey = l_suppkey + and ps_partkey = l_partkey + and p_partkey = l_partkey + and o_orderkey = l_orderkey + and s_nationkey = n_nationkey + and p_name like '%green%' + ) as profit + group by + nation, + o_year + order by + nation, + o_year desc + ; + + +========================== + +resultset consists of 175 rows in 3 columns + +-- current part timing: started at (epoch: 1750437113 s, 848240 usec), finished at (epoch: 1750437118 s, 491998 usec), time used: 4.64 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437118 s, 492013 usec), finished at (epoch: 1750437118 s, 559360 usec), time used: 0.07 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:31:53 2025 + (epoch: 1750437113 s, 848240 usec) +finished at: Fri Jun 20 18:31:58 2025 + (epoch: 1750437118 s, 559360 usec) +time used : 4.71 s + +******************** +* TPC-H Query 20 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Potential Part Promotion Query (Q20) +-- Function Query Definition +-- Approved February 1998 +-- TPC-H Query 20 0 + + + + + + select + s_name, + s_address + from + {supplier_ds}, + {nation_ds} + where + s_suppkey in ( + select + ps_suppkey + from + {part_supp_ds} + where + ps_partkey in ( + select + p_partkey + from + {part_ds} + where + p_name like 'forest%' + ) + and ps_availqty > ( + select + 0.5 * sum(l_quantity) + from + {line_item_ds} + where + l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date '1994-01-01' + and l_shipdate < date '1994-01-01' + interval '1' year + ) + ) + and s_nationkey = n_nationkey + and n_name = 'CANADA' + order by + s_name + ; + + +================== + +resultset consists of 225456 rows in 2 columns + +-- current part timing: started at (epoch: 1750437118 s, 560298 usec), finished at (epoch: 1750437120 s, 197444 usec), time used: 1.64 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437120 s, 197465 usec), finished at (epoch: 1750437120 s, 224104 usec), time used: 0.03 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:31:58 2025 + (epoch: 1750437118 s, 560298 usec) +finished at: Fri Jun 20 18:32:00 2025 + (epoch: 1750437120 s, 224104 usec) +time used : 1.66 s + +******************* +* TPC-H Query 6 0 * +******************* + +-- $ID$ +-- TPC-H/TPC-R Forecasting Revenue Change Query (Q6) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 6 0 + + + + + + select + sum(l_extendedprice * l_discount) as revenue + from + {line_item_ds} + where + l_shipdate >= timestamp '1994-01-01' + and l_shipdate < timestamp '1994-01-01' + interval '1' year + and l_discount between .06 - 0.01 and .06 + 0.01 + and l_quantity < 24 + ; + + +========= + +resultset consists of 1 rows in 1 columns + +-- current part timing: started at (epoch: 1750437120 s, 224157 usec), finished at (epoch: 1750437121 s, 65863 usec), time used: 0.84 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437121 s, 65875 usec), finished at (epoch: 1750437121 s, 84345 usec), time used: 0.02 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:00 2025 + (epoch: 1750437120 s, 224157 usec) +finished at: Fri Jun 20 18:32:01 2025 + (epoch: 1750437121 s, 84345 usec) +time used : 0.86 s + 
+******************** +* TPC-H Query 17 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Small-Quantity-Order Revenue Query (Q17) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 17 0 + + + + + + select + round(sum(l_extendedprice) / 7.0, 2) as avg_yearly + from + {line_item_ds}, + {part_ds} + where + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container = 'MED BOX' + and l_quantity < ( + select + 0.2 * avg(l_quantity) + from + {line_item_ds} + where + l_partkey = p_partkey + ) + ; + + +============ + +resultset consists of 1 rows in 1 columns + +-- current part timing: started at (epoch: 1750437121 s, 84405 usec), finished at (epoch: 1750437121 s, 271458 usec), time used: 0.19 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437121 s, 271463 usec), finished at (epoch: 1750437121 s, 303083 usec), time used: 0.03 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:01 2025 + (epoch: 1750437121 s, 84405 usec) +finished at: Fri Jun 20 18:32:01 2025 + (epoch: 1750437121 s, 303083 usec) +time used : 0.22 s + +******************** +* TPC-H Query 18 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Large Volume Customer Query (Q18) +-- Function Query Definition +-- Approved February 1998 +-- Minor modification - result set limit ( 2.1.2.9.3 ) +-- TPC-H Query 18 0 + + + + + + select + c_name, + c_custkey, + o_orderkey, + o_orderdate as o_orderdat, + o_totalprice, + sum(l_quantity) as col6 + from + {customer_ds}, + {orders_ds}, + {line_item_ds} + where + o_orderkey in ( + select + l_orderkey + from + {line_item_ds} + group by + l_orderkey having + sum(l_quantity) > 300 + ) + and c_custkey = o_custkey + and o_orderkey = l_orderkey + group by + c_name, + c_custkey, + o_orderkey, + o_orderdate, + o_totalprice + order by + o_totalprice desc, + o_orderdate + limit 100 + ; + + +=============================================================================== + +resultset consists of 100 rows in 6 columns + +-- current part timing: started at (epoch: 1750437121 s, 303141 usec), finished at (epoch: 1750437148 s, 891075 usec), time used: 27.59 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437148 s, 891093 usec), finished at (epoch: 1750437148 s, 909931 usec), time used: 0.02 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:01 2025 + (epoch: 1750437121 s, 303141 usec) +finished at: Fri Jun 20 18:32:28 2025 + (epoch: 1750437148 s, 909931 usec) +time used : 27.61 s + +******************* +* TPC-H Query 8 0 * +******************* + +-- $ID$ +-- TPC-H/TPC-R National Market Share Query (Q8) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 8 0 + + + + + + select + o_year, + round( + sum(case + when nation = 'BRAZIL' then volume + else 0 + end) / sum(volume) + , 2) as mkt_share + from + ( + select + extract(year from o_orderdate) as o_year, + l_extendedprice * (1 - l_discount) as volume, + n2.n_name as nation + from + {part_ds}, + {supplier_ds}, + {line_item_ds}, + {orders_ds}, + {customer_ds}, + {nation_ds} n1, + {nation_ds} n2, + {region_ds} + where + p_partkey = l_partkey + and s_suppkey = l_suppkey + and l_orderkey = o_orderkey + and o_custkey = c_custkey + and c_nationkey = n1.n_nationkey + and n1.n_regionkey = r_regionkey + and r_name = 'AMERICA' + and s_nationkey = n2.n_nationkey + and o_orderdate between timestamp '1995-01-01' and timestamp '1996-12-31' + and p_type = 'ECONOMY ANODIZED STEEL' + ) as all_nations + 
group by + o_year + order by + o_year + ; + + +================== + +resultset consists of 2 rows in 2 columns + +-- current part timing: started at (epoch: 1750437148 s, 910732 usec), finished at (epoch: 1750437149 s, 325239 usec), time used: 0.41 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437149 s, 325247 usec), finished at (epoch: 1750437149 s, 377605 usec), time used: 0.05 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:28 2025 + (epoch: 1750437148 s, 910732 usec) +finished at: Fri Jun 20 18:32:29 2025 + (epoch: 1750437149 s, 377605 usec) +time used : 0.47 s + +******************** +* TPC-H Query 21 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Suppliers Who Kept Orders Waiting Query (Q21) +-- Functional Query Definition +-- Approved February 1998 +-- Minor modification - result set limit ( 2.1.2.9.3 ) +-- TPC-H Query 21 0 + + + + + + select + s_name, + count(*) as numwait + from + {supplier_ds}, + {line_item_ds} l1, + {orders_ds}, + {nation_ds} + where + s_suppkey = l1.l_suppkey + and o_orderkey = l1.l_orderkey + and o_orderstatus = 'F' + and l1.l_receiptdate > l1.l_commitdate + and exists ( + select + * + from + {line_item_ds} l2 + where + l2.l_orderkey = l1.l_orderkey + and l2.l_suppkey <> l1.l_suppkey + ) + and not exists ( + select + * + from + {line_item_ds} l3 + where + l3.l_orderkey = l1.l_orderkey + and l3.l_suppkey <> l1.l_suppkey + and l3.l_receiptdate > l3.l_commitdate + ) + and s_nationkey = n_nationkey + and n_name = 'SAUDI ARABIA' + group by + s_name + order by + numwait desc, + s_name + limit 100 + ; + + +================ + +resultset consists of 100 rows in 2 columns + +-- current part timing: started at (epoch: 1750437149 s, 377693 usec), finished at (epoch: 1750437151 s, 792788 usec), time used: 2.42 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437151 s, 792806 usec), finished at (epoch: 1750437151 s, 830652 usec), time used: 0.04 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:29 2025 + (epoch: 1750437149 s, 377693 usec) +finished at: Fri Jun 20 18:32:31 2025 + (epoch: 1750437151 s, 830652 usec) +time used : 2.45 s + +******************** +* TPC-H Query 13 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Customer Distribution Query (Q13) +-- Functional Query Definition +-- Approved February 1998 +-- Minor modification - Naming of the columns of the sub select - different syntax for select-list AS clause ( 2.2.3.3 b) ) +-- TPC-H Query 13 0 + + + + + + select + c_count, count(*) as custdist + from ( + select + c_custkey, + count(o_orderkey) + from + {customer_ds} left outer join {orders_ds} on + c_custkey = o_custkey + and o_comment not like '%special%requests%' + group by + c_custkey + )as c_orders (c_custkey, c_count) + group by + c_count + order by + custdist desc, + c_count desc + ; + + +================== + +resultset consists of 28 rows in 2 columns + +-- current part timing: started at (epoch: 1750437151 s, 830724 usec), finished at (epoch: 1750437159 s, 548532 usec), time used: 7.72 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437159 s, 548550 usec), finished at (epoch: 1750437159 s, 582539 usec), time used: 0.03 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:31 2025 + (epoch: 1750437151 s, 830724 usec) +finished at: Fri Jun 20 18:32:39 2025 + (epoch: 1750437159 s, 582539 usec) +time used : 7.75 s + +******************* +* TPC-H Query 3 0 * 
+******************* + +-- $ID$ +-- TPC-H/TPC-R Shipping Priority Query (Q3) +-- Functional Query Definition +-- Approved February 1998 +-- Minor modification - result set limit ( 2.1.2.9.3 ) +-- TPC-H Query 3 0 + + + + + + select + l_orderkey, + sum(l_extendedprice * (1 - l_discount)) as revenue, + o_orderdate, + o_shippriority + from + {customer_ds}, + {orders_ds}, + {line_item_ds} + where + c_mktsegment = 'BUILDING' + and c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate < '1995-03-15' + and l_shipdate > '1995-03-15' + group by + l_orderkey, + o_orderdate, + o_shippriority + order by + revenue desc, + o_orderdate + limit 10 + ; + + +=============================================== + +resultset consists of 10 rows in 4 columns + +-- current part timing: started at (epoch: 1750437159 s, 583518 usec), finished at (epoch: 1750437161 s, 963666 usec), time used: 2.38 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437161 s, 963680 usec), finished at (epoch: 1750437162 s, 10920 usec), time used: 0.05 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:39 2025 + (epoch: 1750437159 s, 583518 usec) +finished at: Fri Jun 20 18:32:42 2025 + (epoch: 1750437162 s, 10920 usec) +time used : 2.43 s + +******************** +* TPC-H Query 22 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Global Sales Opportunity Query (Q22) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 22 0 + + + + + + select + cntrycode, + count(*) as numcust, + sum(c_acctbal) as totacctbal + from ( + select + substring(c_phone from 1 for 2) as cntrycode, + c_acctbal + from + {customer_ds} + where + substring(c_phone from 1 for 2) in + (13, 31, 23, 29, 30, 18, 17) + and c_acctbal > ( + select + avg(c_acctbal) + from + {customer_ds} + where + c_acctbal > 0.00 + and substring (c_phone from 1 for 2) in + (13, 31, 23, 29, 30, 18, 17) + ) + and not exists ( + select + * + from + {orders_ds} + where + o_custkey = c_custkey + ) + ) as custsale + group by + cntrycode + order by + cntrycode + ; + + +============================== + +resultset consists of 7 rows in 3 columns + +-- current part timing: started at (epoch: 1750437162 s, 10994 usec), finished at (epoch: 1750437162 s, 532905 usec), time used: 0.52 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437162 s, 532917 usec), finished at (epoch: 1750437162 s, 541486 usec), time used: 0.01 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:42 2025 + (epoch: 1750437162 s, 10994 usec) +finished at: Fri Jun 20 18:32:42 2025 + (epoch: 1750437162 s, 541486 usec) +time used : 0.53 s + +******************** +* TPC-H Query 16 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Parts/Supplier Relationship Query (Q16) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 16 0 + + + + + + select + p_brand, + p_type, + p_size, + count(distinct ps_suppkey) as supplier_cnt + from + {part_supp_ds}, + {part_ds} + where + p_partkey = ps_partkey + and p_brand <> 'Brand#45' + and p_type not like 'MEDIUM POLISHED%' + and p_size in (49, 14, 23, 45, 19, 3, 36, 9) + and ps_suppkey not in ( + select + s_suppkey + from + {supplier_ds} + where + s_comment like '%Customer%Complaints%' + ) + group by + p_brand, + p_type, + p_size + order by + supplier_cnt desc, + p_brand, + p_type, + p_size + ; + + +==================================== + +resultset consists of 27840 rows in 4 columns + +-- current part timing: started at 
(epoch: 1750437162 s, 542600 usec), finished at (epoch: 1750437167 s, 398568 usec), time used: 4.86 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437167 s, 398588 usec), finished at (epoch: 1750437167 s, 404365 usec), time used: 0.01 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:42 2025 + (epoch: 1750437162 s, 542600 usec) +finished at: Fri Jun 20 18:32:47 2025 + (epoch: 1750437167 s, 404365 usec) +time used : 4.86 s + +******************* +* TPC-H Query 4 0 * +******************* + +-- $ID$ +-- TPC-H/TPC-R Order Priority Checking Query (Q4) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 4 0 + + + + + + select + o_orderpriority, + count(*) as order_count + from + {orders_ds} + where + o_orderdate >= timestamp '1993-07-01' + and o_orderdate < timestamp '1993-07-01' + interval '3' month + and exists ( + select + * + from + {line_item_ds} + where + l_orderkey = o_orderkey + and l_commitdate < l_receiptdate + ) + group by + o_orderpriority + order by + o_orderpriority + ; + + +============================= + +resultset consists of 5 rows in 2 columns + +-- current part timing: started at (epoch: 1750437167 s, 404427 usec), finished at (epoch: 1750437167 s, 879949 usec), time used: 0.48 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437167 s, 879970 usec), finished at (epoch: 1750437167 s, 891741 usec), time used: 0.01 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:47 2025 + (epoch: 1750437167 s, 404427 usec) +finished at: Fri Jun 20 18:32:47 2025 + (epoch: 1750437167 s, 891741 usec) +time used : 0.49 s + +******************** +* TPC-H Query 11 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Important Stock Identification Query (Q11) +-- Functional Query Definition +-- Approved February 1998 +-- Minor modification - Quoting keyword value ( 2.2.3.3 k) ) +-- TPC-H Query 11 0 + + + + + + select + ps_partkey, + round(sum(ps_supplycost * ps_availqty), 2) as value + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds} + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + group by + ps_partkey having + sum(ps_supplycost * ps_availqty) > ( + select + sum(ps_supplycost * ps_availqty) * 0.0001 + from + {part_supp_ds}, + {supplier_ds}, + {nation_ds} + where + ps_suppkey = s_suppkey + and s_nationkey = n_nationkey + and n_name = 'GERMANY' + ) + order by + value desc + ; + + +================== + +resultset consists of 2849331 rows in 2 columns + +-- current part timing: started at (epoch: 1750437167 s, 892636 usec), finished at (epoch: 1750437171 s, 903481 usec), time used: 4.01 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437171 s, 903495 usec), finished at (epoch: 1750437171 s, 908493 usec), time used: 0.00 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:47 2025 + (epoch: 1750437167 s, 892636 usec) +finished at: Fri Jun 20 18:32:51 2025 + (epoch: 1750437171 s, 908493 usec) +time used : 4.02 s + +******************** +* TPC-H Query 15 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Top Supplier Query (Q15) +-- Functional Query Definition +-- Approved February 1998 +-- Using approved Variant A of Q15 (Appendix B) - 'with clause' instead of 'create view/drop view' +-- Variant allowed because of 2.2.4.2 and 2.2.4.3 +-- TPC-H Query 15 0 + + + +with revenue0 (supplier_no, total_revenue) as +( + + select + s_suppkey, + s_name, + 
s_address, + s_phone, + total_revenue + from + {supplier_ds}, + revenue + where + s_suppkey = supplier_no + and total_revenue = ( + select + max(total_revenue) + from + revenue + ) + order by + s_suppkey + ; + + +=================================================================================================================== + +resultset consists of 4 rows in 10 columns + +-- current part timing: started at (epoch: 1750437178 s, 690669 usec), finished at (epoch: 1750437182 s, 549689 usec), time used: 3.86 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437182 s, 549695 usec), finished at (epoch: 1750437182 s, 577175 usec), time used: 0.03 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:32:58 2025 + (epoch: 1750437178 s, 690669 usec) +finished at: Fri Jun 20 18:33:02 2025 + (epoch: 1750437182 s, 577175 usec) +time used : 3.89 s + +******************** +* TPC-H Query 10 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Returned Item Reporting Query (Q10) +-- Functional Query Definition +-- Approved February 1998 +-- Minor modification - result set limit ( 2.1.2.9.3 ) +-- TPC-H Query 10 0 + + + + + + select + c_custkey, + c_name, + round(sum(l_extendedprice * (1 - l_discount)), 2) as revenue, + c_acctbal, + n_name, + c_address, + c_phone, + c_comment + from + {customer_ds}, + {orders_ds}, + {line_item_ds}, + {nation_ds} + where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and o_orderdate >= date '1993-10-01' + and o_orderdate < date '1993-10-01' + interval '3' month + and l_returnflag = 'R' + and c_nationkey = n_nationkey + group by + c_custkey, + c_name, + c_acctbal, + c_phone, + n_name, + c_address, + c_comment + order by + revenue desc + limit 20 + ; + + +======================================================================= + +resultset consists of 20 rows in 8 columns + +-- current part timing: started at (epoch: 1750437182 s, 577940 usec), finished at (epoch: 1750437188 s, 893659 usec), time used: 6.32 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437188 s, 893673 usec), finished at (epoch: 1750437188 s, 962393 usec), time used: 0.07 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:33:02 2025 + (epoch: 1750437182 s, 577940 usec) +finished at: Fri Jun 20 18:33:08 2025 + (epoch: 1750437188 s, 962393 usec) +time used : 6.38 s + +******************** +* TPC-H Query 19 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Discounted Revenue Query (Q19) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 19 0 + + + + + + select + round(sum(l_extendedprice* (1 - l_discount)), 2) as revenue + from + {line_item_ds}, + {part_ds} + where + ( + p_partkey = l_partkey + and p_brand = 'Brand#12' + and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') + and l_quantity >= 1 and l_quantity <= 1 + 10 + and p_size between 1 and 5 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#23' + and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') + and l_quantity >= 10 and l_quantity <= 20 + and p_size between 1 and 10 + and l_shipmode in ('AIR', 'AIR REG') + and l_shipinstruct = 'DELIVER IN PERSON' + ) + or + ( + p_partkey = l_partkey + and p_brand = 'Brand#34' + and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') + and l_quantity >= 20 and l_quantity <= 30 + and p_size between 1 and 15 + and l_shipmode in ('AIR', 'AIR REG') + and 
l_shipinstruct = 'DELIVER IN PERSON' + ) + ; + + +========= + +resultset consists of 1 rows in 1 columns + +-- current part timing: started at (epoch: 1750437188 s, 963356 usec), finished at (epoch: 1750437189 s, 143713 usec), time used: 0.18 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437189 s, 143726 usec), finished at (epoch: 1750437189 s, 220857 usec), time used: 0.08 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:33:08 2025 + (epoch: 1750437188 s, 963356 usec) +finished at: Fri Jun 20 18:33:09 2025 + (epoch: 1750437189 s, 220857 usec) +time used : 0.26 s + +******************* +* TPC-H Query 5 0 * +******************* + +-- $ID$ +-- TPC-H/TPC-R Local Supplier Volume Query (Q5) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 5 0 + + + + + + select + n_name, + sum(l_extendedprice * (1 - l_discount)) as revenue + from + {customer_ds}, + {orders_ds}, + {line_item_ds}, + {supplier_ds}, + {nation_ds}, + {region_ds} + where + c_custkey = o_custkey + and l_orderkey = o_orderkey + and l_suppkey = s_suppkey + and c_nationkey = s_nationkey + and s_nationkey = n_nationkey + and n_regionkey = r_regionkey + and r_name = 'ASIA' + and o_orderdate >= timestamp '1994-01-01' + and o_orderdate < timestamp '1994-01-01' + interval '1' year + group by + n_name + order by + revenue desc + ; + + +================ + +resultset consists of 5 rows in 2 columns + +-- current part timing: started at (epoch: 1750437189 s, 220929 usec), finished at (epoch: 1750437190 s, 801745 usec), time used: 1.58 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437190 s, 801762 usec), finished at (epoch: 1750437190 s, 877866 usec), time used: 0.08 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:33:09 2025 + (epoch: 1750437189 s, 220929 usec) +finished at: Fri Jun 20 18:33:10 2025 + (epoch: 1750437190 s, 877866 usec) +time used : 1.66 s + +******************* +* TPC-H Query 7 0 * +******************* + +-- $ID$ +-- TPC-H/TPC-R Volume Shipping Query (Q7) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 7 0 + + + + + + select + supp_nation, + cust_nation, + l_year, + sum(volume) as revenue + from + ( + select + n1.n_name as supp_nation, + n2.n_name as cust_nation, + year(l_shipdate) as l_year, + l_extendedprice * (1 - l_discount) as volume + from + {supplier_ds}, + {line_item_ds}, + {orders_ds}, + {customer_ds}, + {nation_ds} n1, + {nation_ds} n2 + where + s_suppkey = l_suppkey + and o_orderkey = l_orderkey + and c_custkey = o_custkey + and s_nationkey = n1.n_nationkey + and c_nationkey = n2.n_nationkey + and ( + (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY') + or (n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE') + ) + and l_shipdate between timestamp '1995-01-01' and timestamp '1996-12-31' + ) as shipping + group by + supp_nation, + cust_nation, + l_year + order by + supp_nation, + cust_nation, + l_year + ; + + +======================================== + +resultset consists of 4 rows in 4 columns + +-- current part timing: started at (epoch: 1750437190 s, 877952 usec), finished at (epoch: 1750437192 s, 917013 usec), time used: 2.04 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437192 s, 917028 usec), finished at (epoch: 1750437192 s, 983548 usec), time used: 0.07 s + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:33:10 2025 + (epoch: 1750437190 s, 877952 usec) +finished at: Fri Jun 20 
18:33:12 2025 + (epoch: 1750437192 s, 983548 usec) +time used : 2.11 s + +******************** +* TPC-H Query 12 0 * +******************** + +-- $ID$ +-- TPC-H/TPC-R Shipping Modes and Order Priority Query (Q12) +-- Functional Query Definition +-- Approved February 1998 +-- TPC-H Query 12 0 + + + + + + select + l_shipmode, + sum(case + when o_orderpriority = '1-URGENT' + or o_orderpriority = '2-HIGH' + then 1 + else 0 + end) as high_line_count, + sum(case + when o_orderpriority <> '1-URGENT' + and o_orderpriority <> '2-HIGH' + then 1 + else 0 + end) as low_line_count + from + {orders_ds}, + {line_item_ds} + where + o_orderkey = l_orderkey + and l_shipmode in ('MAIL', 'SHIP') + and l_commitdate < l_receiptdate + and l_shipdate < l_commitdate + and l_receiptdate >= date '1994-01-01' + and l_receiptdate < date '1994-01-01' + interval '1' year + group by + l_shipmode + order by + l_shipmode + ; + + +=========================================== + +resultset consists of 2 rows in 3 columns + +-- current part timing: started at (epoch: 1750437192 s, 984520 usec), finished at (epoch: 1750437194 s, 33122 usec), time used: 1.05 s + +COMMIT; + +affected rows: 0 + +-- current part timing: started at (epoch: 1750437194 s, 33140 usec), finished at (epoch: 1750437194 s, 60886 usec), time used: 0.03 s + + + + +----------- +- Summary - +----------- + +started at: Fri Jun 20 18:33:12 2025 + (epoch: 1750437192 s, 984520 usec) +finished at: Fri Jun 20 18:33:14 2025 + (epoch: 1750437194 s, 60886 usec) +time used : 1.08 s + +************* +* TPC-H End * +************* + +--------------------------- +- Summary of querystream 0 - +--------------------------- + +started at: Fri Jun 20 18:31:52 2025 + (epoch: 1750437112 s, 81911 usec) +finished at: Fri Jun 20 18:33:14 2025 + (epoch: 1750437194 s, 62087 usec) +time used : 81.98 s + diff --git a/queries/exasol/utils.py b/queries/exasol/utils.py new file mode 100644 index 0000000..9c1ba8b --- /dev/null +++ b/queries/exasol/utils.py @@ -0,0 +1,325 @@ +from __future__ import annotations + +import time +from contextlib import suppress +from glob import glob +from pathlib import Path + +import pandas as _pd +import pyarrow.parquet as pq +import pyexasol + +from queries.common_utils import run_query_generic +from settings import Settings + +settings = Settings() + +_connection: pyexasol.ExaConnection | None = None + + +def get_connection() -> pyexasol.ExaConnection: + global _connection + if _connection is None: + dsn = f"{settings.exasol.host}:{settings.exasol.port}" + schema = settings.exasol.schema_name or settings.exasol.user + _connection = pyexasol.connect( + dsn=dsn, + user=settings.exasol.user, + password=settings.exasol.password, + schema=schema, + ) + return _connection + + +def get_line_item_ds() -> str: + return "lineitem" + + +def get_orders_ds() -> str: + return "orders" + + +def get_customer_ds() -> str: + return "customer" + + +def get_region_ds() -> str: + return "region" + + +def get_nation_ds() -> str: + return "nation" + + +def get_supplier_ds() -> str: + return "supplier" + + +def get_part_ds() -> str: + return "part" + + +def get_part_supp_ds() -> str: + return "partsupp" + + +def get_db_library_version() -> str: + """Get the Exasol database version via SQL.""" + conn = get_connection() + row = conn.execute( + "SELECT PARAM_VALUE FROM SYS.EXA_METADATA " + "WHERE PARAM_NAME='databaseProductVersion'" + ).fetchall()[0] + return row[0] + + +_TABLE_NAMES = [ + "nation", + "region", + "part", + "supplier", + "partsupp", + "customer", + "orders", + 
"lineitem", +] + +# Mapping of TPC-H table to its columns for CSV import (ignore trailing empty field) +_TABLE_COLUMNS: dict[str, list[str]] = { + "nation": ["n_nationkey", "n_name", "n_regionkey", "n_comment"], + "region": ["r_regionkey", "r_name", "r_comment"], + "part": [ + "p_partkey", + "p_name", + "p_mfgr", + "p_brand", + "p_type", + "p_size", + "p_container", + "p_retailprice", + "p_comment", + ], + "supplier": [ + "s_suppkey", + "s_name", + "s_address", + "s_nationkey", + "s_phone", + "s_acctbal", + "s_comment", + ], + "partsupp": [ + "ps_partkey", + "ps_suppkey", + "ps_availqty", + "ps_supplycost", + "ps_comment", + ], + "customer": [ + "c_custkey", + "c_name", + "c_address", + "c_nationkey", + "c_phone", + "c_acctbal", + "c_mktsegment", + "c_comment", + ], + "orders": [ + "o_orderkey", + "o_custkey", + "o_orderstatus", + "o_totalprice", + "o_orderdate", + "o_orderpriority", + "o_clerk", + "o_shippriority", + "o_comment", + ], + "lineitem": [ + "l_orderkey", + "l_partkey", + "l_suppkey", + "l_linenumber", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_tax", + "l_returnflag", + "l_linestatus", + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "l_shipinstruct", + "l_shipmode", + "l_comment", + ], +} + + +# ---------------------------------------------------------------------------- +def prepare_schema() -> None: + """Run create_schema.sql to create Exasol tables before loading data.""" + conn = get_connection() + scripts_dir = Path(__file__).parent / "queries" + sql = (scripts_dir / "create_schema.sql").read_text() + for stmt in sql.split(";"): + stmt_str = stmt.strip() + if not stmt_str: + continue + if stmt_str.lower().startswith("set autocommit"): + continue + conn.execute(stmt_str) + + +# ---------------------------------------------------------------------------- +def prepare_postload() -> None: + """Run post-load SQL to index tables and gather statistics.""" + conn = get_connection() + scripts_dir = Path(__file__).parent / "queries" + for script in ("create_indices_1node.sql", "analyze_database.sql"): + sql = (scripts_dir / script).read_text() + for stmt in sql.split(";"): + stmt_str = stmt.strip() + if not stmt_str: + continue + if stmt_str.lower().startswith("set autocommit"): + continue + conn.execute(stmt_str) + + +def prepare_load_data() -> None: + """Load TPC-H data files into Exasol tables using IMPORT FROM files. + + Verify row counts. 
+ """ + conn = get_connection() + base_dir = settings.dataset_base_dir + overall_start = time.perf_counter() + for table in _TABLE_NAMES: + table_start = time.perf_counter() + # compute expected row count from parquet metadata if present + expected_count = 0 + for pq_file in sorted(Path(base_dir).glob(f"{table}.parquet*")): + expected_count += pq.ParquetFile(str(pq_file)).metadata.num_rows + + cols = _TABLE_COLUMNS[table] + csv_cols = [f"1..{len(cols)}"] + pattern = str(base_dir / f"{table}.tbl*") + for file_path in sorted(glob(pattern)): # noqa: PTH207 + conn.import_from_file( + file_path, + table=table, + import_params={ + "columns": cols, + "csv_cols": csv_cols, + "column_separator": "|", + }, + ) + + row = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchall()[0] + count = row[0] + if count == 0: + msg = f"No rows loaded into Exasol table '{table}'" + raise RuntimeError(msg) + if expected_count and count != expected_count: + msg = ( + f"Row count mismatch for '{table}': loaded {count:,} rows " + f"but expected {expected_count:,}" + ) + raise RuntimeError(msg) + + elapsed = time.perf_counter() - table_start + if expected_count: + print( + f"{table:>10}: loaded {count:,}/{expected_count:,} rows in {elapsed:.2f}s", + flush=True, + ) + else: + print(f"{table:>10}: loaded {count:,} rows in {elapsed:.2f}s", flush=True) + + total_elapsed = time.perf_counter() - overall_start + print(f"Total load time: {total_elapsed:.2f}s", flush=True) + + +def _check_query_result_exasol(result: _pd.DataFrame, query_number: int) -> None: + """Assert that the given pandas DataFrame matches the expected answer. + + Apply Exasol-specific normalization (case, whitespace, types, dates). + """ + import warnings + + import pandas as pd + from pandas.testing import assert_frame_equal + + from queries.common_utils import _get_query_answer_pd + + expected = _get_query_answer_pd(query_number) + # detect which columns are string/extension dtype in the expected answers + string_cols = [ + col.lower() + for col in expected.columns + if pd.api.types.is_string_dtype(expected[col]) + or pd.api.types.is_object_dtype(expected[col]) + ] + # normalize column names to lowercase for comparison + got = result.reset_index(drop=True).copy() + got.columns = [c.lower() for c in got.columns] + exp = expected.copy() + exp.columns = [c.lower() for c in exp.columns] + for col in string_cols: + if col in got.columns and col in exp.columns: + got[col] = got[col].astype(str).str.strip() + exp[col] = exp[col].astype(str).str.strip() + for col in exp.columns: + with suppress(Exception): + exp[col] = exp[col].to_numpy() + for col in set(got.columns).intersection(exp.columns): + try: + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", message="Could not infer format.*", category=UserWarning + ) + got[col] = pd.to_datetime(got[col]).dt.strftime("%Y-%m-%d") + exp[col] = pd.to_datetime(exp[col]).dt.strftime("%Y-%m-%d") + except Exception: + pass + + assert_frame_equal(got, exp, check_dtype=False) + + +def run_query(query_number: int, query: str) -> None: + conn = get_connection() + + if not (settings.run.show_results or settings.run.check_results): + + def execute() -> None: + cursor = conn.execute(query) + with suppress(Exception): + cursor.fetchall() + return None + else: + + def execute() -> _pd.DataFrame: + cursor = conn.execute(query) + rows = cursor.fetchall() + cols = cursor.column_names() + df = _pd.DataFrame(rows, columns=cols) + # Round DECIMAL columns to their defined scale + for name, dtype in 
cursor.columns().items(): + scale = None + if isinstance(dtype, dict): + if dtype.get("type", "").upper() == "DECIMAL": + scale = dtype.get("scale") + if scale is not None: + vals = df[name].astype(float) + df[name] = vals.round(scale) + return df + + run_query_generic( + execute, + query_number, + "exasol", + library_version=get_db_library_version(), + query_checker=_check_query_result_exasol, + ) diff --git a/requirements-dev.txt b/requirements-dev.txt index a29ae3d..e3245b0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,16 +1,20 @@ # This file was autogenerated by uv via the following command: # uv pip compile requirements-dev.in -mypy==1.15.0 +mypy==1.16.1 # via -r requirements-dev.in mypy-extensions==1.1.0 # via mypy numpy==2.2.6 # via pandas-stubs -pandas-stubs==2.2.3.250308 +pandas-stubs==2.3.0.250703 # via -r requirements-dev.in -ruff==0.11.11 +pathspec==0.12.1 + # via mypy +ruff==0.12.2 # via -r requirements-dev.in +tomli==2.2.1 + # via mypy types-pytz==2025.2.0.20250516 # via pandas-stubs -typing-extensions==4.13.2 +typing-extensions==4.14.1 # via mypy diff --git a/requirements.in b/requirements.in index 9dfed50..3afedd6 100644 --- a/requirements.in +++ b/requirements.in @@ -6,6 +6,7 @@ modin[ray] pandas>=2.0 polars polars_cloud +pyexasol # Required for Exasol queries pyspark pyarrow # Required by duckdb/pandas diff --git a/requirements.txt b/requirements.txt index 0da07a8..456b529 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,8 +6,10 @@ attrs==25.3.0 # via # jsonschema # referencing -certifi==2025.4.26 +certifi==2025.6.15 # via requests +cffi==1.17.1 + # via cryptography charset-normalizer==3.4.2 # via requests click==8.2.1 @@ -20,6 +22,8 @@ contourpy==1.3.2 # via matplotlib cramjam==2.10.0 # via fastparquet +cryptography==45.0.5 + # via pyopenssl cycler==0.12.1 # via matplotlib dask==2025.5.1 @@ -28,13 +32,13 @@ dask==2025.5.1 # dask-expr dask-expr==2.0.0 # via -r requirements.in -duckdb==1.3.0 +duckdb==1.3.1 # via -r requirements.in fastparquet==2024.11.0 # via -r requirements.in filelock==3.18.0 # via ray -fonttools==4.58.0 +fonttools==4.58.5 # via matplotlib fsspec==2025.5.1 # via @@ -43,7 +47,9 @@ fsspec==2025.5.1 # modin idna==3.10 # via requests -jsonschema==4.23.0 +importlib-metadata==8.7.0 + # via dask +jsonschema==4.24.0 # via ray jsonschema-specifications==2025.4.1 # via jsonschema @@ -57,11 +63,11 @@ matplotlib==3.10.3 # via plotnine mizani==0.13.5 # via plotnine -modin==0.32.0 +modin==0.33.2 # via -r requirements.in -msgpack==1.1.0 +msgpack==1.1.1 # via ray -narwhals==1.40.0 +narwhals==1.46.0 # via plotly numpy==2.2.6 # via @@ -83,6 +89,7 @@ packaging==25.0 # matplotlib # modin # plotly + # pyexasol # ray # statsmodels pandas==2.2.3 @@ -98,19 +105,19 @@ partd==1.4.2 # via dask patsy==1.0.1 # via statsmodels -pillow==11.2.1 +pillow==11.3.0 # via matplotlib -plotly==6.1.1 +plotly==6.2.0 # via -r requirements.in -plotnine==0.14.5 +plotnine==0.14.6 # via -r requirements.in -polars==1.30.0 +polars==1.31.0 # via # -r requirements.in # polars-cloud -polars-cloud==0.0.9 +polars-cloud==0.0.10 # via -r requirements.in -protobuf==6.31.0 +protobuf==6.31.1 # via ray psutil==7.0.0 # via modin @@ -121,14 +128,22 @@ pyarrow==20.0.0 # -r requirements.in # dask # modin -pydantic==2.11.5 +pyasn1==0.6.1 + # via rsa +pycparser==2.22 + # via cffi +pydantic==2.11.7 # via # -r requirements.in # pydantic-settings pydantic-core==2.33.2 # via pydantic -pydantic-settings==2.9.1 +pydantic-settings==2.10.1 + # via -r requirements.in +pyexasol==0.27.0 # via -r 
requirements.in +pyopenssl==25.1.0 + # via pyexasol pyparsing==3.2.3 # via matplotlib pyspark==4.0.0 @@ -137,7 +152,7 @@ python-dateutil==2.9.0.post0 # via # matplotlib # pandas -python-dotenv==1.1.0 +python-dotenv==1.1.1 # via pydantic-settings pytz==2025.2 # via pandas @@ -145,39 +160,43 @@ pyyaml==6.0.2 # via # dask # ray -ray==2.46.0 +ray==2.47.1 # via modin referencing==0.36.2 # via # jsonschema # jsonschema-specifications -requests==2.32.3 +requests==2.32.4 # via ray -rpds-py==0.25.1 +rpds-py==0.26.0 # via # jsonschema # referencing +rsa==4.9.1 + # via pyexasol scipy==1.15.3 # via # mizani # plotnine # statsmodels -setuptools==80.8.0 +setuptools==80.9.0 # via -r requirements.in six==1.17.0 # via python-dateutil -statsmodels==0.14.4 +statsmodels==0.14.5 # via plotnine toolz==1.0.0 # via # dask # partd -tpchgen-cli==1.1.0 +tpchgen-cli==1.1.1 # via -r requirements.in -typing-extensions==4.13.2 +typing-extensions==4.14.1 # via + # polars-cloud # pydantic # pydantic-core + # pyopenssl # referencing # typing-inspection typing-inspection==0.4.1 @@ -186,5 +205,9 @@ typing-inspection==0.4.1 # pydantic-settings tzdata==2025.2 # via pandas -urllib3==2.4.0 +urllib3==2.5.0 # via requests +websocket-client==1.8.0 + # via pyexasol +zipp==3.23.0 + # via importlib-metadata diff --git a/scripts/plot_bars.py b/scripts/plot_bars.py index ea91713..8738a41 100644 --- a/scripts/plot_bars.py +++ b/scripts/plot_bars.py @@ -28,6 +28,7 @@ "polars": "#0075FF", "polars-eager": "#00B4D8", "duckdb": "#80B9C8", + "exasol": "#30CC30", "pyspark": "#C29470", "dask": "#77D487", "pandas": "#2B8C5D", @@ -38,6 +39,7 @@ "polars": "Polars", "polars-eager": "Polars - eager", "duckdb": "DuckDB", + "exasol": "Exasol", "pandas": "pandas", "dask": "Dask", "modin": "Modin", diff --git a/settings.py b/settings.py index e2ad533..37cd065 100644 --- a/settings.py +++ b/settings.py @@ -75,6 +75,20 @@ class Plot(BaseSettings): ) +class Exasol(BaseSettings): + """Settings for connecting to an Exasol database.""" + + host: str + port: int = 8563 + user: str + password: str + schema_name: str = "tpc" + + model_config = SettingsConfigDict( + env_prefix="exasol_", env_file=".env", extra="ignore" + ) + + class Settings(BaseSettings): scale_factor: float = 1.0 num_batches: int | None = None @@ -83,6 +97,11 @@ class Settings(BaseSettings): plot: Plot = Plot() run: Run = Run() + @computed_field # type: ignore[prop-decorator] + @property + def exasol(self) -> Exasol: + return Exasol() + @computed_field # type: ignore[prop-decorator] @property def dataset_base_dir(self) -> Path:
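A quick way to sanity-check the new `Exasol` settings class and the connection logic in `queries/exasol/utils.py` is a standalone smoke test. The sketch below is not part of the diff; it simply mirrors `get_connection()` and `get_db_library_version()`, assuming the `exasol_`-prefixed environment variables (or a `.env` file) are already populated.

```python
# Hypothetical smoke test, mirroring get_connection() / get_db_library_version()
# from queries/exasol/utils.py. Assumes the exasol_* settings are set via the
# environment or a .env file read by pydantic-settings.
import pyexasol

from settings import Settings

settings = Settings()

conn = pyexasol.connect(
    dsn=f"{settings.exasol.host}:{settings.exasol.port}",
    user=settings.exasol.user,
    password=settings.exasol.password,
    schema=settings.exasol.schema_name or settings.exasol.user,
)

# Same metadata query the runner uses to record the database version.
version = conn.execute(
    "SELECT PARAM_VALUE FROM SYS.EXA_METADATA "
    "WHERE PARAM_NAME='databaseProductVersion'"
).fetchall()[0][0]
print(f"Connected to Exasol {version}")
conn.close()
```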
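The DDL phases (`prepare_schema()` and `prepare_postload()`) execute each script statement by statement rather than as one batch. The helper below is a minimal sketch of that splitting logic, not a drop-in replacement: it assumes, as the runner does, that the bundled scripts contain no semicolons inside string literals, and it skips client-side `SET AUTOCOMMIT` directives that pyexasol would reject.

```python
# Sketch of the statement splitting used by prepare_schema() / prepare_postload().
# Assumption: no semicolons occur inside string literals in the DDL scripts.
from pathlib import Path

import pyexasol


def run_sql_script(conn: pyexasol.ExaConnection, script: Path) -> None:
    for stmt in script.read_text().split(";"):
        stmt_str = stmt.strip()
        # Skip empty fragments and client-side autocommit toggles.
        if not stmt_str or stmt_str.lower().startswith("set autocommit"):
            continue
        conn.execute(stmt_str)


# Example (path derived from prepare_schema(), which resolves scripts relative
# to queries/exasol/):
# run_sql_script(conn, Path("queries/exasol/queries/create_schema.sql"))
```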
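For the load phase, `prepare_load_data()` streams each `.tbl` file into its table with `pyexasol.import_from_file` and then verifies the row count. The single-table sketch below illustrates that path under simplifying assumptions: a single `customer.tbl` file under `dataset_base_dir` (the runner globs `customer.tbl*` to support batched files), and the column list copied from `_TABLE_COLUMNS["customer"]`.

```python
# Minimal single-table version of the load performed by prepare_load_data().
# Assumes one customer.tbl file under settings.dataset_base_dir.
from pathlib import Path

from queries.exasol.utils import get_connection
from settings import Settings

settings = Settings()
conn = get_connection()

# Matches _TABLE_COLUMNS["customer"] in queries/exasol/utils.py.
cols = [
    "c_custkey", "c_name", "c_address", "c_nationkey",
    "c_phone", "c_acctbal", "c_mktsegment", "c_comment",
]
tbl_file = Path(settings.dataset_base_dir) / "customer.tbl"  # assumed layout

conn.import_from_file(
    str(tbl_file),
    table="customer",
    import_params={
        "columns": cols,                  # target table columns
        "csv_cols": [f"1..{len(cols)}"],  # drop the trailing empty field of .tbl rows
        "column_separator": "|",
    },
)

loaded = conn.execute("SELECT COUNT(*) FROM customer").fetchall()[0][0]
print(f"customer: loaded {loaded:,} rows")
```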