Skip to content

Commit 9d8c6dd

Browse files
authored
chore(optimizer): spill out statistics yaml in tpcds_test (#17784)
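Split the inline table/column statistics out of the per-query case files into a shared statistics.yaml, referenced via `statistics_file`.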
1 parent edcf7f5 commit 9d8c6dd

File tree

4 files changed

+463
-168
lines changed

4 files changed

+463
-168
lines changed

src/query/service/tests/it/sql/planner/optimizer/data/yaml/q1.yaml renamed to src/query/service/tests/it/sql/planner/optimizer/data/cases/q1.yaml

Lines changed: 2 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -25,93 +25,8 @@ sql: |
2525
ORDER BY c_customer_id
2626
LIMIT 100
2727
28-
# Table statistics derived from snow_plan's TableScan information
29-
table_statistics:
30-
date_dim:
31-
num_rows: 73049 # Estimated based on typical date dimension cardinality
32-
data_size: 2138624 # Directly from snow_plan: "bytes: 2,138,624"
33-
number_of_segments: 1 # From snow_plan: "partitions: 1/1"
34-
35-
store_returns:
36-
num_rows: 287000000 # Estimated based on data size and typical row size
37-
data_size: 124763446272 # Directly from snow_plan: "bytes: 124,763,446,272"
38-
number_of_segments: 7070 # From snow_plan: "partitions: 7070/7070"
39-
40-
store:
41-
num_rows: 1002 # Estimated based on typical store dimension cardinality
42-
data_size: 135680 # Directly from snow_plan: "bytes: 135,680"
43-
number_of_segments: 1 # From snow_plan: "partitions: 1/1"
44-
45-
customer:
46-
num_rows: 12000000 # Estimated based on typical customer dimension size
47-
data_size: 2328538624 # Directly from snow_plan: "bytes: 2,328,538,624"
48-
number_of_segments: 261 # From snow_plan: "partitions: 261/261"
49-
50-
# Column statistics derived from query predicates and typical TPC-DS data distributions
51-
column_statistics:
52-
# Date dimension columns used in the query
53-
date_dim.d_year:
54-
min: 1990 # Typical range for TPC-DS
55-
max: 2010 # Typical range for TPC-DS
56-
ndv: 21 # Based on min/max range (2010-1990+1)
57-
null_count: 0 # Primary dimension columns typically don't have nulls
58-
59-
date_dim.d_date_sk:
60-
min: 1 # Typical starting value for surrogate key
61-
max: 73049 # Based on table row count
62-
ndv: 73049 # Primary key, so NDV equals row count
63-
null_count: 0 # Primary key cannot be null
64-
65-
# Store returns columns used in the query
66-
store_returns.sr_returned_date_sk:
67-
min: 1 # Matches date_dim.d_date_sk min
68-
max: 73049 # Matches date_dim.d_date_sk max
69-
ndv: 73049 # Foreign key to date_dim
70-
null_count: 287998 # Inferred from filter in snow_plan: "STORE_RETURNS.SR_RETURNED_DATE_SK IS NOT NULL"
71-
72-
store_returns.sr_customer_sk:
73-
min: 1 # Typical starting value for surrogate key
74-
max: 12000000 # Matches customer.c_customer_sk max
75-
ndv: 11000000 # Estimated as slightly less than customer table cardinality
76-
null_count: 143500 # Inferred from filter in snow_plan: "STORE_RETURNS.SR_CUSTOMER_SK IS NOT NULL"
77-
78-
store_returns.sr_store_sk:
79-
min: 1 # Typical starting value for surrogate key
80-
max: 1002 # Matches store.s_store_sk max
81-
ndv: 1002 # Foreign key to store table
82-
null_count: 143500 # Inferred from filter in snow_plan: "STORE_RETURNS.SR_STORE_SK IS NOT NULL"
83-
84-
store_returns.sr_return_amt:
85-
min: 0.01 # Minimum reasonable return amount
86-
max: 10000.00 # Maximum reasonable return amount
87-
ndv: 100000 # Estimated based on typical distribution
88-
null_count: 0 # Return amount is typically not null
89-
90-
# Store columns used in the query
91-
store.s_store_sk:
92-
min: 1 # Typical starting value for surrogate key
93-
max: 1002 # Based on estimated row count
94-
ndv: 1002 # Primary key, so NDV equals row count
95-
null_count: 0 # Primary key cannot be null
96-
97-
store.s_state:
98-
min: "AK" # Alaska (alphabetically first US state abbreviation)
99-
max: "WY" # Wyoming (alphabetically last US state)
100-
ndv: 50 # Number of US states
101-
null_count: 0 # State is typically not null
102-
103-
# Customer columns used in the query
104-
customer.c_customer_sk:
105-
min: 1 # Typical starting value for surrogate key
106-
max: 12000000 # Based on estimated row count
107-
ndv: 12000000 # Primary key, so NDV equals row count
108-
null_count: 0 # Primary key cannot be null
109-
110-
customer.c_customer_id:
111-
min: "AAAAAAAAAAAAAA" # Lexicographically smallest possible customer ID
112-
max: "ZZZZZZZZZZZZZZ" # Lexicographically largest possible customer ID
113-
ndv: 12000000 # Same as c_customer_sk (1:1 relationship)
114-
null_count: 0 # Customer ID is typically not null
28+
# Reference to external statistics file
29+
statistics_file: statistics.yaml
11530

11631
raw_plan: |
11732
Limit

src/query/service/tests/it/sql/planner/optimizer/data/yaml/q3.yaml renamed to src/query/service/tests/it/sql/planner/optimizer/data/cases/q3.yaml

Lines changed: 2 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -13,80 +13,8 @@ sql: |
1313
ORDER BY dt.d_year, sum_agg DESC, brand_id
1414
LIMIT 100
1515
16-
table_statistics:
17-
date_dim:
18-
num_rows: 73049 # Estimated based on typical date dimension cardinality
19-
data_size: 2138624 # From snow_plan: "TableScan (DATE_DIM as DT) [partitions: 1/1, bytes: 2,138,624]"
20-
data_size_compressed: 1069312 # Estimated as 50% of data_size
21-
index_size: 427724 # Estimated as 20% of data_size
22-
number_of_blocks: 21 # Estimated based on data_size
23-
number_of_segments: 1 # From snow_plan: "partitions: 1/1"
24-
store_sales:
25-
num_rows: 2879987999 # Estimated based on data size and typical row size
26-
data_size: 1212628258304 # From snow_plan: "TableScan (STORE_SALES) [partitions: 70,412/72,718, bytes: 1,212,628,258,304]"
27-
data_size_compressed: 606314129152 # Estimated as 50% of data_size
28-
index_size: 242525651660 # Estimated as 20% of data_size
29-
number_of_blocks: 12126282 # Estimated based on data_size
30-
number_of_segments: 70412 # From snow_plan: "partitions: 70,412/72,718"
31-
item:
32-
num_rows: 462000 # Estimated based on ss_item_sk range and typical item dimension size
33-
data_size: 23811584 # From snow_plan: "TableScan (ITEM) [partitions: 2/2, bytes: 23,811,584]"
34-
data_size_compressed: 11905792 # Estimated as 50% of data_size
35-
index_size: 4762316 # Estimated as 20% of data_size
36-
number_of_blocks: 238 # Estimated based on data_size
37-
number_of_segments: 2 # From snow_plan: "partitions: 2/2"
38-
39-
column_statistics:
40-
date_dim.d_year:
41-
min: 1990 # Typical range for TPC-DS
42-
max: 2000 # Typical range for TPC-DS
43-
ndv: 11 # Based on min/max range
44-
null_count: 0 # Primary dimension columns typically don't have nulls
45-
date_dim.d_date_sk:
46-
min: 1 # Typical starting value for surrogate key
47-
max: 73049 # Based on table row count
48-
ndv: 73049 # Primary key, so NDV equals row count
49-
null_count: 0 # Primary key cannot be null
50-
date_dim.d_moy:
51-
min: 1 # January
52-
max: 12 # December
53-
ndv: 12 # 12 months in a year
54-
null_count: 0 # Date parts typically don't have nulls
55-
store_sales.ss_ext_sales_price:
56-
min: 0.01 # Minimum reasonable sales price
57-
max: 30000.00 # Maximum reasonable extended sales price
58-
ndv: 573997 # Estimated as ~0.02% of row count (num_rows: 2879987999)
59-
null_count: 0 # Sales amount is typically not null
60-
store_sales.ss_sold_date_sk:
61-
min: 1 # Matches date_dim.d_date_sk min
62-
max: 73049 # Matches date_dim.d_date_sk max
63-
ndv: 73049 # Foreign key to date_dim
64-
null_count: 287998 # From snow_plan filter: "STORE_SALES.SS_SOLD_DATE_SK IS NOT NULL" implies some nulls exist
65-
store_sales.ss_item_sk:
66-
min: 1 # Typical starting value for surrogate key
67-
max: 462000 # Matches item.i_item_sk max
68-
ndv: 462000 # Foreign key to item table
69-
null_count: 0 # Required join key is typically not null
70-
item.i_brand_id:
71-
min: 1 # Typical starting value for ID
72-
max: 1000 # Typical range for TPC-DS
73-
ndv: 948 # Estimated based on TPC-DS typical cardinality
74-
null_count: 0 # Brand ID is typically not null
75-
item.i_brand:
76-
min: "AAAAAAAAAAAAAA" # Lexicographically smallest possible brand name
77-
max: "zzzzzzzzzzzzzz" # Lexicographically largest possible brand name
78-
ndv: 948 # Same as i_brand_id (1:1 relationship)
79-
null_count: 0 # Brand name is typically not null
80-
item.i_item_sk:
81-
min: 1 # Typical starting value for surrogate key
82-
max: 462000 # Based on estimated row count
83-
ndv: 462000 # Primary key, so NDV equals row count
84-
null_count: 0 # Primary key cannot be null
85-
item.i_manufact_id:
86-
min: 1 # Typical starting value for ID
87-
max: 1000 # Typical range for TPC-DS
88-
ndv: 1000 # Based on typical TPC-DS cardinality
89-
null_count: 0 # Manufacturer ID is typically not null
16+
# Reference to external statistics file
17+
statistics_file: statistics.yaml
9018

9119
raw_plan: |
9220
Limit

0 commit comments

Comments
 (0)