Skip to content

Commit 053d3a9

Browse files
wip - add data gen
1 parent 731ad2a commit 053d3a9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+9438
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/.idea
22
/target
33
/benchmarks/data/
4-
testdata/tpch/data/
4+
testdata/tpch/data/
5+
testdata/tpcds/data/

testdata/tpcds/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This directory contains:
2+
- 99 TPC-DS queries. Source: https://docs.snowflake.com/en/user-guide/sample-data-tpcds
3+
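- `generate.sh <scale_factor>` uses the DuckDB CLI to generate the TPC-DS dataset at the given scale factor and export every table as partitioned Parquet files under `testdata/tpcds/data/`.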

testdata/tpcds/generate.sh

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
#!/bin/bash
#
# Generates the TPC-DS dataset with the DuckDB CLI and exports every table as
# partitioned, uncompressed Parquet files under testdata/tpcds/data/.
#
# Usage: generate.sh <scale_factor>
#   scale_factor  Unsigned integer or decimal, forwarded to dsdgen(sf=...).
#
# Requires the `duckdb` CLI on PATH. Exits with status 1 on bad arguments or a
# missing CLI; any failing command aborts the script (set -e).

set -e

if [ $# -ne 1 ]; then
  echo "Usage: $0 <scale_factor>" >&2
  echo "Scale factor must be greater than or equal to 0" >&2
  exit 1
fi

SCALE_FACTOR=$1

# The pattern only admits unsigned decimal literals, so a match already
# guarantees a value >= 0. It also makes the value safe to splice into the SQL
# text passed to duckdb below.
if ! [[ "$SCALE_FACTOR" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
  echo "Error: Scale factor must be a number greater than or equal to 0" >&2
  exit 1
fi

if ! command -v duckdb &> /dev/null; then
  echo "Error: duckdb CLI is not installed" >&2
  echo "Please install duckdb: https://duckdb.org/docs/installation/" >&2
  exit 1
fi

# Every path below is relative to the repository root. Resolve the root from
# this script's own location (testdata/tpcds/generate.sh) so that the
# `rm -rf` and the exports hit the intended directory from any working dir.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$REPO_ROOT"

DB_FILE=tpcds.duckdb

# Remove the scratch database (and its write-ahead log, if one was created)
# on every exit path, including failures triggered by `set -e`.
cleanup() {
  rm -f "$DB_FILE" "$DB_FILE.wal"
}
trap cleanup EXIT

echo "Clearing testdata/tpcds/data directory..."
rm -rf testdata/tpcds/data
mkdir -p testdata/tpcds/data

echo "Removing existing database file..."
rm -f "$DB_FILE"

echo "Generating TPC-DS data with scale factor $SCALE_FACTOR..."
duckdb "$DB_FILE" -c "INSTALL tpcds; LOAD tpcds; CALL dsdgen(sf=$SCALE_FACTOR);"

echo "Exporting tables to parquet files..."
# -bail stops at the first failing statement instead of continuing with the
# remaining COPYs and leaving a silently incomplete dataset.
duckdb -bail "$DB_FILE" << 'EOF'
-- SALES (often the largest tables): partitioned by sold-date key
COPY store_sales TO 'testdata/tpcds/data/store_sales'
  (FORMAT PARQUET, PARTITION_BY (ss_sold_date_sk), COMPRESSION UNCOMPRESSED);
COPY web_sales TO 'testdata/tpcds/data/web_sales'
  (FORMAT PARQUET, PARTITION_BY (ws_sold_date_sk), COMPRESSION UNCOMPRESSED);
COPY catalog_sales TO 'testdata/tpcds/data/catalog_sales'
  (FORMAT PARQUET, PARTITION_BY (cs_sold_date_sk), COMPRESSION UNCOMPRESSED);

-- RETURNS: partitioned by returned-date key (the counterpart of the sales
-- tables' sold-date key)
COPY store_returns TO 'testdata/tpcds/data/store_returns'
  (FORMAT PARQUET, PARTITION_BY (sr_returned_date_sk), COMPRESSION UNCOMPRESSED);
COPY web_returns TO 'testdata/tpcds/data/web_returns'
  (FORMAT PARQUET, PARTITION_BY (wr_returned_date_sk), COMPRESSION UNCOMPRESSED);
COPY catalog_returns TO 'testdata/tpcds/data/catalog_returns'
  (FORMAT PARQUET, PARTITION_BY (cr_returned_date_sk), COMPRESSION UNCOMPRESSED);

-- Dimension tables and the inventory fact table: each partitioned on one column.
-- NOTE(review): wp_web_page_sk, cp_catalog_page_sk and c_current_addr_sk look
-- like (near-)unique keys, which would yield roughly one partition per row.
-- Confirm that this is intended before using large scale factors.
COPY date_dim TO 'testdata/tpcds/data/date_dim' (FORMAT PARQUET, PARTITION_BY (d_year), COMPRESSION UNCOMPRESSED);
COPY time_dim TO 'testdata/tpcds/data/time_dim' (FORMAT PARQUET, PARTITION_BY (t_hour), COMPRESSION UNCOMPRESSED);
COPY item TO 'testdata/tpcds/data/item' (FORMAT PARQUET, PARTITION_BY (i_category_id), COMPRESSION UNCOMPRESSED);
COPY store TO 'testdata/tpcds/data/store' (FORMAT PARQUET, PARTITION_BY (s_state), COMPRESSION UNCOMPRESSED);
COPY web_site TO 'testdata/tpcds/data/web_site' (FORMAT PARQUET, PARTITION_BY (web_company_id), COMPRESSION UNCOMPRESSED);
COPY web_page TO 'testdata/tpcds/data/web_page' (FORMAT PARQUET, PARTITION_BY (wp_web_page_sk), COMPRESSION UNCOMPRESSED);
COPY warehouse TO 'testdata/tpcds/data/warehouse' (FORMAT PARQUET, PARTITION_BY (w_state), COMPRESSION UNCOMPRESSED);
COPY ship_mode TO 'testdata/tpcds/data/ship_mode' (FORMAT PARQUET, PARTITION_BY (sm_type), COMPRESSION UNCOMPRESSED);
COPY promotion TO 'testdata/tpcds/data/promotion' (FORMAT PARQUET, PARTITION_BY (p_channel_catalog), COMPRESSION UNCOMPRESSED);
COPY customer TO 'testdata/tpcds/data/customer' (FORMAT PARQUET, PARTITION_BY (c_current_addr_sk), COMPRESSION UNCOMPRESSED);
COPY customer_address TO 'testdata/tpcds/data/customer_address' (FORMAT PARQUET, PARTITION_BY (ca_state), COMPRESSION UNCOMPRESSED);
COPY customer_demographics TO 'testdata/tpcds/data/customer_demographics' (FORMAT PARQUET, PARTITION_BY (cd_gender), COMPRESSION UNCOMPRESSED);
COPY household_demographics TO 'testdata/tpcds/data/household_demographics' (FORMAT PARQUET, PARTITION_BY (hd_income_band_sk), COMPRESSION UNCOMPRESSED);
COPY income_band TO 'testdata/tpcds/data/income_band' (FORMAT PARQUET, PARTITION_BY (ib_lower_bound), COMPRESSION UNCOMPRESSED);
COPY reason TO 'testdata/tpcds/data/reason' (FORMAT PARQUET, PARTITION_BY (r_reason_desc), COMPRESSION UNCOMPRESSED);
COPY catalog_page TO 'testdata/tpcds/data/catalog_page' (FORMAT PARQUET, PARTITION_BY (cp_catalog_page_sk), COMPRESSION UNCOMPRESSED);
COPY inventory TO 'testdata/tpcds/data/inventory' (FORMAT PARQUET, PARTITION_BY (inv_date_sk), COMPRESSION UNCOMPRESSED);
COPY call_center TO 'testdata/tpcds/data/call_center' (FORMAT PARQUET, PARTITION_BY (cc_state), COMPRESSION UNCOMPRESSED);
EOF

echo "Cleaning up temporary database..."
cleanup

echo "TPC-DS data generation complete!"

testdata/tpcds/queries/q1.sql

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
-- TPC-DS query 96 (see the embedded query tag).
-- Counts store sales made between 8:30 and 8:59 to households with a
-- dependent count of 5 at stores named 'ese'.
select /* { "query":"query96","streamId":0,"querySequence":1 } */
    count(*)
from store_sales
inner join household_demographics
    on ss_hdemo_sk = household_demographics.hd_demo_sk
inner join time_dim
    on ss_sold_time_sk = time_dim.t_time_sk
inner join store
    on ss_store_sk = s_store_sk
where time_dim.t_hour = 8
  and time_dim.t_minute >= 30
  and household_demographics.hd_dep_count = 5
  and store.s_store_name = 'ese'
order by count(*)
limit 100;

testdata/tpcds/queries/q10.sql

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
-- TPC-DS query 78 (see the embedded query tag).
-- For 2001 sales that were never returned, compares each customer/item's
-- store totals with the same customer/item's combined web + catalog totals.
with /* { "query":"query78","streamId":0,"querySequence":10 } */ ws as (
    -- Unreturned web sales, totalled per year / item / billing customer.
    select
        d_year as ws_sold_year,
        ws_item_sk,
        ws_bill_customer_sk as ws_customer_sk,
        sum(ws_quantity) as ws_qty,
        sum(ws_wholesale_cost) as ws_wc,
        sum(ws_sales_price) as ws_sp
    from web_sales
    left join web_returns
        on wr_order_number = ws_order_number
        and ws_item_sk = wr_item_sk
    inner join date_dim
        on ws_sold_date_sk = d_date_sk
    where wr_order_number is null  -- anti-join: keep sales with no return row
    group by
        d_year,
        ws_item_sk,
        ws_bill_customer_sk
),
cs as (
    -- Unreturned catalog sales, totalled per year / item / billing customer.
    select
        d_year as cs_sold_year,
        cs_item_sk,
        cs_bill_customer_sk as cs_customer_sk,
        sum(cs_quantity) as cs_qty,
        sum(cs_wholesale_cost) as cs_wc,
        sum(cs_sales_price) as cs_sp
    from catalog_sales
    left join catalog_returns
        on cr_order_number = cs_order_number
        and cs_item_sk = cr_item_sk
    inner join date_dim
        on cs_sold_date_sk = d_date_sk
    where cr_order_number is null  -- anti-join: keep sales with no return row
    group by
        d_year,
        cs_item_sk,
        cs_bill_customer_sk
),
ss as (
    -- Unreturned store sales, totalled per year / item / customer.
    select
        d_year as ss_sold_year,
        ss_item_sk,
        ss_customer_sk,
        sum(ss_quantity) as ss_qty,
        sum(ss_wholesale_cost) as ss_wc,
        sum(ss_sales_price) as ss_sp
    from store_sales
    left join store_returns
        on sr_ticket_number = ss_ticket_number
        and ss_item_sk = sr_item_sk
    inner join date_dim
        on ss_sold_date_sk = d_date_sk
    where sr_ticket_number is null  -- anti-join: keep sales with no return row
    group by
        d_year,
        ss_item_sk,
        ss_customer_sk
)
select
    ss_customer_sk,
    round(ss_qty / (coalesce(ws_qty, 0) + coalesce(cs_qty, 0)), 2) as ratio,
    ss_qty as store_qty,
    ss_wc as store_wholesale_cost,
    ss_sp as store_sales_price,
    coalesce(ws_qty, 0) + coalesce(cs_qty, 0) as other_chan_qty,
    coalesce(ws_wc, 0) + coalesce(cs_wc, 0) as other_chan_wholesale_cost,
    coalesce(ws_sp, 0) + coalesce(cs_sp, 0) as other_chan_sales_price
from ss
left join ws
    on ws_sold_year = ss_sold_year
    and ws_item_sk = ss_item_sk
    and ws_customer_sk = ss_customer_sk
left join cs
    on cs_sold_year = ss_sold_year
    and cs_item_sk = ss_item_sk
    and cs_customer_sk = ss_customer_sk
where ss_sold_year = 2001
  -- require activity in at least one other channel
  and (coalesce(ws_qty, 0) > 0 or coalesce(cs_qty, 0) > 0)
order by
    ss_customer_sk,
    ss_qty desc,
    ss_wc desc,
    ss_sp desc,
    other_chan_qty,
    other_chan_wholesale_cost,
    other_chan_sales_price,
    ratio
limit 100;

testdata/tpcds/queries/q11.sql

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
-- TPC-DS query 86 (see the embedded query tag).
-- Web net-paid totals rolled up over (i_category, i_class) for a 12-value
-- d_month_seq window, ranked within each rollup level / parent category.
select /* { "query":"query86","streamId":0,"querySequence":11 } */
    sum(ws_net_paid) as total_sum,
    i_category,
    i_class,
    grouping(i_category) + grouping(i_class) as lochierarchy,
    rank() over (
        partition by
            grouping(i_category) + grouping(i_class),
            case when grouping(i_class) = 0 then i_category end
        order by sum(ws_net_paid) desc
    ) as rank_within_parent
from web_sales
inner join date_dim as d1
    on d1.d_date_sk = ws_sold_date_sk
inner join item
    on i_item_sk = ws_item_sk
where d1.d_month_seq between 1205 and 1205+11
group by rollup(i_category, i_class)
order by
    lochierarchy desc,
    -- only the class-level rows (lochierarchy = 0) are ordered by category
    case when lochierarchy = 0 then i_category end,
    rank_within_parent
limit 100;

testdata/tpcds/queries/q12.sql

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
-- TPC-DS query 01 (see the embedded query tag).
-- Lists customers whose 1999 store-return total at a store in state 'NM'
-- exceeds 1.2x the average per-customer return total of that store.
with /* { "query":"query01","streamId":0,"querySequence":12 } */ customer_total_return as (
    -- 1999 return amount (incl. tax) per customer and store.
    select
        sr_customer_sk as ctr_customer_sk,
        sr_store_sk as ctr_store_sk,
        sum(SR_RETURN_AMT_INC_TAX) as ctr_total_return
    from store_returns
    inner join date_dim
        on sr_returned_date_sk = d_date_sk
    where d_year = 1999
    group by
        sr_customer_sk,
        sr_store_sk
)
select
    c_customer_id
from customer_total_return as ctr1
inner join store
    on s_store_sk = ctr1.ctr_store_sk
inner join customer
    on ctr1.ctr_customer_sk = c_customer_sk
where s_state = 'NM'
  and ctr1.ctr_total_return > (
        -- per-store benchmark, correlated on the outer row's store
        select avg(ctr_total_return) * 1.2
        from customer_total_return as ctr2
        where ctr1.ctr_store_sk = ctr2.ctr_store_sk
      )
order by c_customer_id
limit 100;

testdata/tpcds/queries/q13.sql

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
-- TPC-DS query 91 (see the embedded query tag).
-- Catalog-return net loss per call center for November 2002, restricted to
-- returning customers matching the demographic and address filters below.
select /* { "query":"query91","streamId":0,"querySequence":13 } */
    cc_call_center_id as Call_Center,
    cc_name as Call_Center_Name,
    cc_manager as Manager,
    sum(cr_net_loss) as Returns_Loss
from call_center
inner join catalog_returns
    on cr_call_center_sk = cc_call_center_sk
inner join date_dim
    on cr_returned_date_sk = d_date_sk
inner join customer
    on cr_returning_customer_sk = c_customer_sk
inner join customer_demographics
    on cd_demo_sk = c_current_cdemo_sk
inner join household_demographics
    on hd_demo_sk = c_current_hdemo_sk
inner join customer_address
    on ca_address_sk = c_current_addr_sk
where d_year = 2002
  and d_moy = 11
  and (
        (cd_marital_status = 'M' and cd_education_status = 'Unknown')
        or (cd_marital_status = 'W' and cd_education_status = 'Advanced Degree')
      )
  and hd_buy_potential like 'Unknown%'
  and ca_gmt_offset = -6
-- marital/education status are grouped on but not selected, so one call
-- center can appear on several output rows
group by
    cc_call_center_id,
    cc_name,
    cc_manager,
    cd_marital_status,
    cd_education_status
order by sum(cr_net_loss) desc;

testdata/tpcds/queries/q14.sql

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
-- TPC-DS query 21 (see the embedded query tag).
-- For items priced between 0.99 and 1.49, sums inventory on hand per
-- warehouse/item in the 30 days before and the 30 days from 2000-05-19, and
-- keeps the pairs whose after/before ratio lies between 2/3 and 3/2.
select /* { "query":"query21","streamId":0,"querySequence":14 } */ *
from (
    select
        w_warehouse_name,
        i_item_id,
        sum(case when cast(d_date as date) < cast('2000-05-19' as date)
                 then inv_quantity_on_hand
                 else 0 end) as inv_before,
        sum(case when cast(d_date as date) >= cast('2000-05-19' as date)
                 then inv_quantity_on_hand
                 else 0 end) as inv_after
    from inventory
    inner join warehouse
        on inv_warehouse_sk = w_warehouse_sk
    inner join item
        on i_item_sk = inv_item_sk
    inner join date_dim
        on inv_date_sk = d_date_sk
    where i_current_price between 0.99 and 1.49
      -- Standard date +/- interval arithmetic, replacing the Snowflake-only
      -- dateadd(day, ..., to_date(...)) form and matching the cast(... as date)
      -- literals used above.
      and d_date between cast('2000-05-19' as date) - interval '30' day
                     and cast('2000-05-19' as date) + interval '30' day
    group by
        w_warehouse_name,
        i_item_id
) x
-- NOTE(review): if the target engine truncates integer / integer, this ratio
-- is truncated before the range check; confirm the engine's division rules.
where (case when inv_before > 0
            then inv_after / inv_before
            else null
       end) between 2.0/3.0 and 3.0/2.0
order by
    w_warehouse_name,
    i_item_id
limit 100;

testdata/tpcds/queries/q15.sql

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
-- TPC-DS query 43 (see the embedded query tag).
-- Store sales price totals per weekday for year 2000, for stores whose
-- s_gmt_offset is -6. A weekday with no sales yields NULL, not 0.
select /* { "query":"query43","streamId":0,"querySequence":15 } */
    s_store_name,
    s_store_id,
    sum(case when d_day_name = 'Sunday' then ss_sales_price end) as sun_sales,
    sum(case when d_day_name = 'Monday' then ss_sales_price end) as mon_sales,
    sum(case when d_day_name = 'Tuesday' then ss_sales_price end) as tue_sales,
    sum(case when d_day_name = 'Wednesday' then ss_sales_price end) as wed_sales,
    sum(case when d_day_name = 'Thursday' then ss_sales_price end) as thu_sales,
    sum(case when d_day_name = 'Friday' then ss_sales_price end) as fri_sales,
    sum(case when d_day_name = 'Saturday' then ss_sales_price end) as sat_sales
from date_dim
inner join store_sales
    on d_date_sk = ss_sold_date_sk
inner join store
    on s_store_sk = ss_store_sk
where s_gmt_offset = -6
  and d_year = 2000
group by
    s_store_name,
    s_store_id
order by
    s_store_name,
    s_store_id,
    sun_sales,
    mon_sales,
    tue_sales,
    wed_sales,
    thu_sales,
    fri_sales,
    sat_sales
limit 100;

0 commit comments

Comments
 (0)