#!/bin/bash
#
# Generate TPC-DS test data as partitioned, uncompressed Parquet files.
#
# Usage: <this script> <scale_factor>
#   scale_factor  Non-negative integer or decimal, passed to DuckDB's dsdgen().
#
# Requires the `duckdb` CLI on PATH.
#
# All paths are relative to the current working directory:
#   - testdata/tpcds/data is deleted and regenerated, one sub-directory per
#     table;
#   - tpcds.duckdb is used as a scratch database. Any existing file of that
#     name is removed before generation, and it is removed again on exit.

set -euo pipefail

readonly DATA_DIR='testdata/tpcds/data'
readonly DB_FILE='tpcds.duckdb'

# Tables to export, one "table:partition_column" entry each.
# NOTE(review): several partition columns are surrogate keys (for example
# c_current_addr_sk, wp_web_page_sk, cp_catalog_page_sk); they are kept as in
# the original script, but confirm that one partition per key value is wanted.
readonly TABLES=(
  # Sales facts, partitioned by sold-date key.
  'store_sales:ss_sold_date_sk'
  'web_sales:ws_sold_date_sk'
  'catalog_sales:cs_sold_date_sk'

  # Returns facts, partitioned by returned-date key.
  'store_returns:sr_returned_date_sk'
  'web_returns:wr_returned_date_sk'
  'catalog_returns:cr_returned_date_sk'

  # Dimension tables and the inventory fact table.
  'date_dim:d_year'
  'time_dim:t_hour'
  'item:i_category_id'
  'store:s_state'
  'web_site:web_company_id'
  'web_page:wp_web_page_sk'
  'warehouse:w_state'
  'ship_mode:sm_type'
  'promotion:p_channel_catalog'
  'customer:c_current_addr_sk'
  'customer_address:ca_state'
  'customer_demographics:cd_gender'
  'household_demographics:hd_income_band_sk'
  'income_band:ib_lower_bound'
  'reason:r_reason_desc'
  'catalog_page:cp_catalog_page_sk'
  'inventory:inv_date_sk'
  'call_center:cc_state'
)

# Print each argument on its own line to stderr, then exit with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Remove the scratch database and its write-ahead log, if present.
# Also registered as an EXIT trap so a failed run does not leave them behind.
cleanup() {
  rm -f -- "$DB_FILE" "$DB_FILE.wal"
}

# Write one COPY statement per TABLES entry to stdout. Each table is exported
# to $DATA_DIR/<table>, partitioned by the column named in its entry.
export_sql() {
  local entry table column
  for entry in "${TABLES[@]}"; do
    table=${entry%%:*}
    column=${entry#*:}
    printf "COPY %s TO '%s/%s' (FORMAT PARQUET, PARTITION_BY (%s), COMPRESSION UNCOMPRESSED);\n" \
      "$table" "$DATA_DIR" "$table" "$column"
  done
}

# Validate the arguments, generate the data set and export it.
# Arguments: $1 - scale factor (non-negative integer or decimal).
# Exits with status 1 on a usage error, an invalid scale factor, or a missing
# duckdb CLI; any failing command also aborts the script (set -e).
main() {
  if (( $# != 1 )); then
    die "Usage: $0 <scale_factor>" \
        "Scale factor must be greater than or equal to 0"
  fi

  local -r scale_factor=$1
  # Digits with an optional fractional part. No sign is accepted, so a matching
  # value is never negative and no separate "< 0" comparison is needed. The
  # same check makes it safe to interpolate the value into SQL below.
  local -r number_re='^[0-9]+(\.[0-9]+)?$'
  if ! [[ "$scale_factor" =~ $number_re ]]; then
    die "Error: Scale factor must be a number greater than or equal to 0"
  fi

  if ! command -v duckdb >/dev/null 2>&1; then
    die "Error: duckdb CLI is not installed" \
        "Please install duckdb: https://duckdb.org/docs/installation/"
  fi

  echo "Clearing $DATA_DIR directory..."
  # ${DATA_DIR:?} aborts instead of expanding to an empty path.
  rm -rf -- "${DATA_DIR:?}"
  mkdir -p -- "$DATA_DIR"

  echo "Removing existing database file..."
  # Install the trap only now, so usage errors never touch an existing file.
  trap cleanup EXIT
  cleanup

  echo "Generating TPC-DS data with scale factor $scale_factor..."
  duckdb "$DB_FILE" -c "INSTALL tpcds; LOAD tpcds; CALL dsdgen(sf=$scale_factor);"

  echo "Exporting tables to parquet files..."
  # -bail stops at the first failing statement instead of continuing with the
  # remaining COPY statements.
  export_sql | duckdb -bail "$DB_FILE"

  echo "Cleaning up temporary database..."
  cleanup

  echo "TPC-DS data generation complete!"
}

main "$@"