Commit f3a5c6e

Infer GROUP BY from SELECT columns when omitted
Queries can now omit GROUP BY entirely, e.g. `SEMANTIC SELECT year, region, AGGREGATE(revenue) FROM sales_v;`. Yardstick infers the grouping columns from the non-AGGREGATE expressions in the SELECT clause. This matches the paper's semantics more closely.
1 parent 658f830 commit f3a5c6e

2 files changed: 127 additions & 12 deletions

README.md

Lines changed: 6 additions & 12 deletions
@@ -58,17 +58,15 @@ SEMANTIC SELECT
   AGGREGATE(revenue) AS revenue,
   AGGREGATE(revenue) AT (ALL region) AS year_total,
   AGGREGATE(revenue) / AGGREGATE(revenue) AT (ALL region) AS pct_of_year
-FROM sales_v
-GROUP BY year, region;
+FROM sales_v;

 -- Variance from the global average
 SEMANTIC SELECT
   region,
   AGGREGATE(revenue) AS revenue,
   AGGREGATE(revenue) AT (ALL) / 4.0 AS expected_if_equal, -- 4 regions
   AGGREGATE(revenue) - (AGGREGATE(revenue) AT (ALL) / 4.0) AS variance
-FROM sales_v
-GROUP BY region;
+FROM sales_v;

 -- Nested percentages (% of year, and that year's % of total)
 SEMANTIC SELECT
@@ -77,26 +75,23 @@ SEMANTIC SELECT
   AGGREGATE(revenue) AS revenue,
   100.0 * AGGREGATE(revenue) / AGGREGATE(revenue) AT (ALL region) AS pct_of_year,
   100.0 * AGGREGATE(revenue) AT (ALL region) / AGGREGATE(revenue) AT (ALL) AS year_pct_of_total
-FROM sales_v
-GROUP BY year, region;
+FROM sales_v;

 -- Compare 2024 performance to 2023 baseline for each region
 SEMANTIC SELECT
   region,
   AGGREGATE(revenue) AT (SET year = 2024) AS rev_2024,
   AGGREGATE(revenue) AT (SET year = 2023) AS rev_2023,
   AGGREGATE(revenue) AT (SET year = 2024) - AGGREGATE(revenue) AT (SET year = 2023) AS growth
-FROM sales_v
-GROUP BY region;
+FROM sales_v;

 -- Filter to specific segments
 SEMANTIC SELECT
   year,
   AGGREGATE(revenue) AS total_revenue,
   AGGREGATE(revenue) AT (SET region = 'North') AS north_revenue,
   AGGREGATE(revenue) AT (SET region IN ('North', 'South')) AS north_south_combined
-FROM sales_v
-GROUP BY year;
+FROM sales_v;
 ```

 ## Syntax
@@ -122,8 +117,7 @@ Queries using `AGGREGATE()` must use the `SEMANTIC` prefix:
 SEMANTIC SELECT
   dimensions,
   AGGREGATE(measure_name) [AT modifier]
-FROM view_name
-GROUP BY dimensions;
+FROM view_name;
 ```

 ### AT Modifiers

yardstick-rs/src/sql/measures.rs

Lines changed: 121 additions & 0 deletions
@@ -2009,6 +2009,10 @@ pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult {
     // Extract GROUP BY columns for AT (ALL dim) correlation
     let group_by_cols = extract_group_by_columns(sql);

+    // Extract dimension columns from original SQL for implicit GROUP BY
+    // (must be done before expansion since expanded SQL has SUM() etc)
+    let original_dim_cols = extract_dimension_columns_from_select(sql);
+
     // Check if any AT modifier needs correlation (for alias handling)
     let needs_outer_alias = at_patterns.iter().any(|(_, modifiers, _, _)| {
         modifiers.iter().any(|m| {
@@ -2101,6 +2105,29 @@ pub fn expand_aggregate_with_at(sql: &str) -> AggregateExpandResult {
         return expand_aggregate(&result_sql);
     }

+    // If no GROUP BY, add explicit GROUP BY with dimension columns from original SQL
+    // (GROUP BY ALL doesn't work reliably with scalar subqueries mixed with aggregates)
+    let result_upper = result_sql.to_uppercase();
+    if !result_upper.contains("GROUP BY") && !original_dim_cols.is_empty() {
+        // Find insertion point: before ORDER BY, LIMIT, HAVING, or at end
+        let insert_pos = ["ORDER BY", "LIMIT", "HAVING", ";"]
+            .iter()
+            .filter_map(|kw| result_upper.find(kw))
+            .min()
+            .unwrap_or(result_sql.len());
+
+        result_sql = format!(
+            "{} GROUP BY {}{}",
+            result_sql[..insert_pos].trim_end(),
+            original_dim_cols.join(", "),
+            if insert_pos < result_sql.len() {
+                format!(" {}", result_sql[insert_pos..].trim_start())
+            } else {
+                String::new()
+            }
+        );
+    }
+
     AggregateExpandResult {
         had_aggregate: true,
         expanded_sql: result_sql,
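
As an aside, the splice above can be read in isolation. Below is a minimal standalone sketch of the same insertion-point idea, assuming plain ASCII SQL; `splice_group_by` is a hypothetical name, and in the commit this logic runs on the already-expanded `result_sql` rather than the raw input.

```rust
// Hypothetical, simplified restatement of the GROUP BY splice above: find the
// earliest trailing clause (ORDER BY / LIMIT / HAVING / ';') and slot the
// inferred GROUP BY in just before it, or append it at the end.
fn splice_group_by(sql: &str, dims: &[&str]) -> String {
    let upper = sql.to_uppercase();
    let insert_pos = ["ORDER BY", "LIMIT", "HAVING", ";"]
        .iter()
        .filter_map(|kw| upper.find(kw))
        .min()
        .unwrap_or(sql.len());
    let tail = if insert_pos < sql.len() {
        format!(" {}", sql[insert_pos..].trim_start())
    } else {
        String::new()
    };
    format!(
        "{} GROUP BY {}{}",
        sql[..insert_pos].trim_end(),
        dims.join(", "),
        tail
    )
}

// splice_group_by("SELECT year, region, SUM(amount) FROM t ORDER BY year", &["year", "region"])
//   returns "SELECT year, region, SUM(amount) FROM t GROUP BY year, region ORDER BY year"
```
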
@@ -2171,6 +2198,72 @@ fn extract_group_by_columns(sql: &str) -> Vec<String> {
         }
     }

+    // If no GROUP BY, infer dimension columns from SELECT (non-AGGREGATE columns)
+    if columns.is_empty() {
+        columns = extract_dimension_columns_from_select(sql);
+    }
+
+    columns
+}
+
+/// Extract non-AGGREGATE columns from SELECT clause to use as implicit GROUP BY columns
+fn extract_dimension_columns_from_select(sql: &str) -> Vec<String> {
+    let sql_upper = sql.to_uppercase();
+    let mut columns = Vec::new();
+
+    // Find SELECT ... FROM
+    let select_pos = sql_upper.find("SELECT").unwrap_or(0) + 6;
+    let from_pos = sql_upper.find("FROM").unwrap_or(sql.len());
+
+    if select_pos >= from_pos {
+        return columns;
+    }
+
+    let select_content = &sql[select_pos..from_pos];
+
+    // Split by comma, but be careful about nested parens
+    let mut depth = 0;
+    let mut current = String::new();
+    let mut items = Vec::new();
+
+    for c in select_content.chars() {
+        match c {
+            '(' => {
+                depth += 1;
+                current.push(c);
+            }
+            ')' => {
+                depth -= 1;
+                current.push(c);
+            }
+            ',' if depth == 0 => {
+                items.push(current.trim().to_string());
+                current = String::new();
+            }
+            _ => current.push(c),
+        }
+    }
+    if !current.trim().is_empty() {
+        items.push(current.trim().to_string());
+    }
+
+    // Filter out AGGREGATE() calls and extract column names
+    for item in items {
+        let item_upper = item.to_uppercase();
+        if item_upper.contains("AGGREGATE(") {
+            continue;
+        }
+        // Handle "col AS alias" - use the column name, not alias
+        let col = if let Some(as_pos) = item_upper.find(" AS ") {
+            item[..as_pos].trim()
+        } else {
+            item.trim()
+        };
+        if !col.is_empty() {
+            columns.push(col.to_string());
+        }
+    }
+
     columns
 }
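
For a quick sense of what the extractor above returns, here is a hypothetical unit test (not part of this commit) sketching its expected behaviour: `AGGREGATE(...)` items are skipped, and a `col AS alias` item contributes the column name rather than the alias.

```rust
// Hypothetical test, not in this commit; illustrates the expected output of
// extract_dimension_columns_from_select for a typical SEMANTIC-style query.
#[test]
fn dimension_columns_are_inferred_from_select_sketch() {
    let cols = extract_dimension_columns_from_select(
        "SELECT year, region AS r, AGGREGATE(revenue) AS revenue FROM sales_v",
    );
    // AGGREGATE(...) is dropped; "region AS r" keeps the column name, not the alias.
    assert_eq!(cols, vec!["year".to_string(), "region".to_string()]);
}
```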

@@ -2492,6 +2585,34 @@ FROM orders"#;
         assert!(result.expanded_sql.contains("_inner"));
     }

+    #[test]
+    #[serial]
+    fn test_expand_aggregate_no_group_by() {
+        // Test that queries without GROUP BY get implicit GROUP BY with dimension columns
+        clear_measure_views();
+        store_measure_view(
+            "sales_v",
+            vec![ViewMeasure {
+                column_name: "revenue".to_string(),
+                expression: "SUM(amount)".to_string(),
+            }],
+            "SELECT year, region, SUM(amount) AS revenue FROM sales GROUP BY ALL",
+        );
+
+        let sql = r#"SELECT year, region, AGGREGATE(revenue) AS revenue, AGGREGATE(revenue) AT (ALL region) AS year_total, AGGREGATE(revenue) / AGGREGATE(revenue) AT (ALL region) AS pct FROM sales_v"#;
+        let result = expand_aggregate_with_at(sql);
+
+        eprintln!("Expanded SQL: {}", result.expanded_sql);
+        assert!(result.had_aggregate);
+        // Should have GROUP BY with explicit columns (not GROUP BY ALL)
+        let upper = result.expanded_sql.to_uppercase();
+        assert!(upper.contains("GROUP BY"));
+        assert!(upper.contains("YEAR"));
+        assert!(upper.contains("REGION"));
+        // Should correlate on year (not region, since AT ALL region removes it)
+        assert!(result.expanded_sql.contains("_inner.year = _outer.year"));
+    }
+
     #[test]
     fn test_parse_current_in_expr() {
         // CURRENT year - 1 should become year - 1
