Skip to content

Commit 1913ade

Browse files
committed
Port TPC-DS benchmarks
Signed-off-by: Abhi Agarwal <[email protected]>
1 parent 2cca8c9 commit 1913ade

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

108 files changed

+5340
-20
lines changed

crates/benchmarks/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,7 @@ harness = false
3131
[[bench]]
3232
name = "smoke"
3333
harness = false
34+
35+
[[bench]]
36+
name = "tpcds"
37+
harness = false

crates/benchmarks/README.md

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,21 +62,6 @@ List cases with:
6262
cargo run --release -p delta-benchmarks -- merge --case single_insert_only_filesMatchedFraction_0.05_rowsNotMatchedFraction_0.05
6363
```
6464

65-
## TPC-DS query helper
66-
67-
All 99 TPC-DS SQL statements (matching the Spark benchmark suite) are stored under `queries/tpcds`. The CLI can list or print them:
68-
69-
```bash
70-
cargo run --release -p delta-benchmarks -- tpcds --list
71-
cargo run --release -p delta-benchmarks -- tpcds --case q1
72-
```
73-
74-
There is also a micro-benchmark that iterates over every query string to ensure the include paths stay wired correctly:
75-
76-
```bash
77-
cargo bench -p delta-benchmarks --bench tpcds
78-
```
79-
8065
### Flamegraphs using `samply`
8166

8267
Using `samply`, you can generate flamegraphs from the profile script.

crates/benchmarks/benches/tpcds.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
use std::path::PathBuf;
2+
3+
use delta_benchmarks::{register_tpcds_tables, tpcds_query, tpcds_query_names};
4+
use divan::{AllocProfiler, Bencher};
5+
6+
fn main() {
7+
divan::main();
8+
}
9+
10+
// Installs divan's `AllocProfiler` as the global allocator for this bench binary.
// NOTE(review): presumably this wraps the system allocator so divan can report
// per-benchmark allocation statistics — confirm against the divan docs.
#[global_allocator]
11+
static ALLOC: AllocProfiler = AllocProfiler::system();
12+
13+
#[divan::bench(args = tpcds_query_names())]
14+
fn tpcds_query_execution(bencher: Bencher, name: &'static str) {
15+
let rt = tokio::runtime::Runtime::new().unwrap();
16+
let sql = tpcds_query(name)
17+
.expect("query must exist")
18+
.split(";")
19+
.filter(|s| !s.trim().is_empty())
20+
.collect::<Vec<_>>();
21+
22+
let tmp_dir = tempfile::tempdir().unwrap();
23+
let parquet_dir = PathBuf::from(
24+
std::env::var("TPCDS_PARQUET_DIR").unwrap_or_else(|_| "data/tpcds_parquet".to_string()),
25+
);
26+
27+
let ctx = rt.block_on(async {
28+
register_tpcds_tables(&tmp_dir, &parquet_dir)
29+
.await
30+
.expect("failed to register TPC-DS tables")
31+
});
32+
33+
bencher.bench_local(|| {
34+
rt.block_on(async {
35+
for sql in sql.iter() {
36+
let df = ctx.sql(sql).await.expect("failed to create dataframe");
37+
divan::black_box(df.collect().await.expect("failed to execute query"));
38+
}
39+
});
40+
});
41+
drop(tmp_dir);
42+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The TPC-DS SQL is vendored from the [datafusion-benchmarks](https://github.com/apache/datafusion-benchmarks) repository.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
-- SQLBench-DS query 1 derived from TPC-DS query 1 under the terms of the TPC Fair Use Policy.
2+
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
3+
-- This query was generated at scale factor 1.
--
-- Summary: returns up to 100 customer ids (ordered by id) whose total store
-- returns in 1999 at a store located in 'TN' exceed 1.2 times the average
-- per-customer return total of that same store.
-- The CTE customer_total_return sums SR_RETURN_AMT_INC_TAX per
-- (customer, store) for returns dated in 1999.
4+
with customer_total_return as
5+
(select sr_customer_sk as ctr_customer_sk
6+
,sr_store_sk as ctr_store_sk
7+
,sum(SR_RETURN_AMT_INC_TAX) as ctr_total_return
8+
from store_returns
9+
,date_dim
10+
where sr_returned_date_sk = d_date_sk
11+
and d_year =1999
12+
group by sr_customer_sk
13+
,sr_store_sk)
14+
select c_customer_id
15+
from customer_total_return ctr1
16+
,store
17+
,customer
-- Correlated subquery: the threshold is computed per store (ctr2 rows sharing ctr1's store).
18+
where ctr1.ctr_total_return > (select avg(ctr_total_return)*1.2
19+
from customer_total_return ctr2
20+
where ctr1.ctr_store_sk = ctr2.ctr_store_sk)
21+
and s_store_sk = ctr1.ctr_store_sk
22+
and s_state = 'TN'
23+
and ctr1.ctr_customer_sk = c_customer_sk
24+
order by c_customer_id
25+
LIMIT 100;
26+
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
-- SQLBench-DS query 10 derived from TPC-DS query 10 under the terms of the TPC Fair Use Policy.
2+
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
3+
-- This query was generated at scale factor 1.
--
-- Summary: counts customers per combination of demographic attributes, for
-- customers whose current address is in one of five listed counties and who
-- have a store purchase in months 3 through 6 of 2002 plus a web or catalog
-- purchase in that same window. Returns the first 100 groups in sort order.
-- Note: cnt1..cnt6 are all count(*) over the same grouping, so they carry the
-- same value in every row.
4+
select
5+
cd_gender,
6+
cd_marital_status,
7+
cd_education_status,
8+
count(*) cnt1,
9+
cd_purchase_estimate,
10+
count(*) cnt2,
11+
cd_credit_rating,
12+
count(*) cnt3,
13+
cd_dep_count,
14+
count(*) cnt4,
15+
cd_dep_employed_count,
16+
count(*) cnt5,
17+
cd_dep_college_count,
18+
count(*) cnt6
19+
from
20+
customer c,customer_address ca,customer_demographics
21+
where
22+
c.c_current_addr_sk = ca.ca_address_sk and
23+
ca_county in ('Clinton County','Platte County','Franklin County','Louisa County','Harmon County') and
24+
cd_demo_sk = c.c_current_cdemo_sk and
-- Required: at least one store sale in the window (3+3 = 6, bounds inclusive).
25+
exists (select *
26+
from store_sales,date_dim
27+
where c.c_customer_sk = ss_customer_sk and
28+
ss_sold_date_sk = d_date_sk and
29+
d_year = 2002 and
30+
d_moy between 3 and 3+3) and
-- Plus at least one sale through either the web or the catalog channel in the same window.
31+
(exists (select *
32+
from web_sales,date_dim
33+
where c.c_customer_sk = ws_bill_customer_sk and
34+
ws_sold_date_sk = d_date_sk and
35+
d_year = 2002 and
36+
d_moy between 3 ANd 3+3) or
37+
exists (select *
38+
from catalog_sales,date_dim
39+
where c.c_customer_sk = cs_ship_customer_sk and
40+
cs_sold_date_sk = d_date_sk and
41+
d_year = 2002 and
42+
d_moy between 3 and 3+3))
43+
group by cd_gender,
44+
cd_marital_status,
45+
cd_education_status,
46+
cd_purchase_estimate,
47+
cd_credit_rating,
48+
cd_dep_count,
49+
cd_dep_employed_count,
50+
cd_dep_college_count
51+
order by cd_gender,
52+
cd_marital_status,
53+
cd_education_status,
54+
cd_purchase_estimate,
55+
cd_credit_rating,
56+
cd_dep_count,
57+
cd_dep_employed_count,
58+
cd_dep_college_count
59+
LIMIT 100;
60+
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
-- SQLBench-DS query 11 derived from TPC-DS query 11 under the terms of the TPC Fair Use Policy.
2+
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
3+
-- This query was generated at scale factor 1.
--
-- Summary: returns up to 100 customers (id, first name, last name, email)
-- whose web-sales total grew from 1999 to 2000 by a larger ratio than their
-- store-sales total did over the same two years.
-- The CTE year_total holds, per customer and year, the sum of
-- (ext_list_price - ext_discount_amt), tagged 's' for store sales and 'w' for
-- web sales.
4+
with year_total as (
5+
select c_customer_id customer_id
6+
,c_first_name customer_first_name
7+
,c_last_name customer_last_name
8+
,c_preferred_cust_flag customer_preferred_cust_flag
9+
,c_birth_country customer_birth_country
10+
,c_login customer_login
11+
,c_email_address customer_email_address
12+
,d_year dyear
13+
,sum(ss_ext_list_price-ss_ext_discount_amt) year_total
14+
,'s' sale_type
15+
from customer
16+
,store_sales
17+
,date_dim
18+
where c_customer_sk = ss_customer_sk
19+
and ss_sold_date_sk = d_date_sk
20+
group by c_customer_id
21+
,c_first_name
22+
,c_last_name
23+
,c_preferred_cust_flag
24+
,c_birth_country
25+
,c_login
26+
,c_email_address
27+
,d_year
28+
union all
29+
select c_customer_id customer_id
30+
,c_first_name customer_first_name
31+
,c_last_name customer_last_name
32+
,c_preferred_cust_flag customer_preferred_cust_flag
33+
,c_birth_country customer_birth_country
34+
,c_login customer_login
35+
,c_email_address customer_email_address
36+
,d_year dyear
37+
,sum(ws_ext_list_price-ws_ext_discount_amt) year_total
38+
,'w' sale_type
39+
from customer
40+
,web_sales
41+
,date_dim
42+
where c_customer_sk = ws_bill_customer_sk
43+
and ws_sold_date_sk = d_date_sk
44+
group by c_customer_id
45+
,c_first_name
46+
,c_last_name
47+
,c_preferred_cust_flag
48+
,c_birth_country
49+
,c_login
50+
,c_email_address
51+
,d_year
52+
)
53+
select
54+
t_s_secyear.customer_id
55+
,t_s_secyear.customer_first_name
56+
,t_s_secyear.customer_last_name
57+
,t_s_secyear.customer_email_address
-- Four-way self-join of year_total: store/web channel x first/second year, all for the same customer.
58+
from year_total t_s_firstyear
59+
,year_total t_s_secyear
60+
,year_total t_w_firstyear
61+
,year_total t_w_secyear
62+
where t_s_secyear.customer_id = t_s_firstyear.customer_id
63+
and t_s_firstyear.customer_id = t_w_secyear.customer_id
64+
and t_s_firstyear.customer_id = t_w_firstyear.customer_id
65+
and t_s_firstyear.sale_type = 's'
66+
and t_w_firstyear.sale_type = 'w'
67+
and t_s_secyear.sale_type = 's'
68+
and t_w_secyear.sale_type = 'w'
69+
and t_s_firstyear.dyear = 1999
70+
and t_s_secyear.dyear = 1999+1
71+
and t_w_firstyear.dyear = 1999
72+
and t_w_secyear.dyear = 1999+1
73+
and t_s_firstyear.year_total > 0
74+
and t_w_firstyear.year_total > 0
-- Compare growth ratios (second year / first year). The two filters above already
-- require both first-year totals to be > 0, so the "else 0.0" branches are never taken here.
75+
and case when t_w_firstyear.year_total > 0 then t_w_secyear.year_total / t_w_firstyear.year_total else 0.0 end
76+
> case when t_s_firstyear.year_total > 0 then t_s_secyear.year_total / t_s_firstyear.year_total else 0.0 end
77+
order by t_s_secyear.customer_id
78+
,t_s_secyear.customer_first_name
79+
,t_s_secyear.customer_last_name
80+
,t_s_secyear.customer_email_address
81+
LIMIT 100;
82+
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
-- SQLBench-DS query 12 derived from TPC-DS query 12 under the terms of the TPC Fair Use Policy.
2+
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
3+
-- This query was generated at scale factor 1.
--
-- Summary: for web sales of items in the categories 'Jewelry', 'Books' and
-- 'Women' sold between 2002-03-22 and 30 days later (bounds inclusive),
-- reports each item's revenue and that revenue as a percentage of the total
-- revenue of the item's class. Returns the first 100 rows in sort order.
4+
select i_item_id
5+
,i_item_desc
6+
,i_category
7+
,i_class
8+
,i_current_price
9+
,sum(ws_ext_sales_price) as itemrevenue
-- Window over the grouped rows: the inner sum is the per-item revenue, the
-- outer sum totals those per-item revenues across each i_class partition.
10+
,sum(ws_ext_sales_price)*100/sum(sum(ws_ext_sales_price)) over
11+
(partition by i_class) as revenueratio
12+
from
13+
web_sales
14+
,item
15+
,date_dim
16+
where
17+
ws_item_sk = i_item_sk
18+
and i_category in ('Jewelry', 'Books', 'Women')
19+
and ws_sold_date_sk = d_date_sk
20+
and d_date between cast('2002-03-22' as date)
21+
and (cast('2002-03-22' as date) + INTERVAL '30 DAYS')
22+
group by
23+
i_item_id
24+
,i_item_desc
25+
,i_category
26+
,i_class
27+
,i_current_price
28+
order by
29+
i_category
30+
,i_class
31+
,i_item_id
32+
,i_item_desc
33+
,revenueratio
34+
LIMIT 100;
35+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
-- SQLBench-DS query 13 derived from TPC-DS query 13 under the terms of the TPC Fair Use Policy.
2+
-- TPC-DS queries are Copyright 2021 Transaction Processing Performance Council.
3+
-- This query was generated at scale factor 1.
--
-- Summary: a single-row aggregate over 2001 store sales: average quantity,
-- average extended sales price, average and total extended wholesale cost.
-- A sale qualifies when it matches one of three demographic/price bands AND
-- one of three address/net-profit bands.
-- Note: the join predicates to household_demographics, customer_demographics
-- and customer_address are repeated inside every OR branch rather than stated
-- once outside the parentheses.
4+
select avg(ss_quantity)
5+
,avg(ss_ext_sales_price)
6+
,avg(ss_ext_wholesale_cost)
7+
,sum(ss_ext_wholesale_cost)
8+
from store_sales
9+
,store
10+
,customer_demographics
11+
,household_demographics
12+
,customer_address
13+
,date_dim
14+
where s_store_sk = ss_store_sk
15+
and ss_sold_date_sk = d_date_sk and d_year = 2001
-- First OR group: marital status / education / sales price / dependent count bands.
16+
and((ss_hdemo_sk=hd_demo_sk
17+
and cd_demo_sk = ss_cdemo_sk
18+
and cd_marital_status = 'U'
19+
and cd_education_status = '4 yr Degree'
20+
and ss_sales_price between 100.00 and 150.00
21+
and hd_dep_count = 3
22+
)or
23+
(ss_hdemo_sk=hd_demo_sk
24+
and cd_demo_sk = ss_cdemo_sk
25+
and cd_marital_status = 'S'
26+
and cd_education_status = 'Unknown'
27+
and ss_sales_price between 50.00 and 100.00
28+
and hd_dep_count = 1
29+
) or
30+
(ss_hdemo_sk=hd_demo_sk
31+
and cd_demo_sk = ss_cdemo_sk
32+
and cd_marital_status = 'D'
33+
and cd_education_status = '2 yr Degree'
34+
and ss_sales_price between 150.00 and 200.00
35+
and hd_dep_count = 1
36+
))
-- Second OR group: United States address state sets paired with net-profit ranges.
37+
and((ss_addr_sk = ca_address_sk
38+
and ca_country = 'United States'
39+
and ca_state in ('CO', 'MI', 'MN')
40+
and ss_net_profit between 100 and 200
41+
) or
42+
(ss_addr_sk = ca_address_sk
43+
and ca_country = 'United States'
44+
and ca_state in ('NC', 'NY', 'TX')
45+
and ss_net_profit between 150 and 300
46+
) or
47+
(ss_addr_sk = ca_address_sk
48+
and ca_country = 'United States'
49+
and ca_state in ('CA', 'NE', 'TN')
50+
and ss_net_profit between 50 and 250
51+
))
52+
;
53+

0 commit comments

Comments
 (0)