Skip to content

Commit 3abc02e

Browse files
authored
chore: enable order_by and limit for new compiler (#1815)
* chore: enable order_by and limit for new compiler * fix tests after merge main
1 parent 72076c7 commit 3abc02e

File tree

11 files changed

+228
-49
lines changed

11 files changed

+228
-49
lines changed

bigframes/core/compile/sqlglot/compiler.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,9 +125,25 @@ def _compile_result_node(self, root: nodes.ResultNode) -> str:
125125
(name, scalar_compiler.compile_scalar_expression(ref))
126126
for ref, name in root.output_cols
127127
)
128-
sqlglot_ir = sqlglot_ir.select(selected_cols)
128+
# Skip squashing selections to ensure the right ordering and limit keys
129+
sqlglot_ir = sqlglot_ir.select(selected_cols, squash_selections=False)
130+
131+
if root.order_by is not None:
132+
ordering_cols = tuple(
133+
sge.Ordered(
134+
this=scalar_compiler.compile_scalar_expression(
135+
ordering.scalar_expression
136+
),
137+
desc=ordering.direction.is_ascending is False,
138+
nulls_first=ordering.na_last is False,
139+
)
140+
for ordering in root.order_by.all_ordering_columns
141+
)
142+
sqlglot_ir = sqlglot_ir.order_by(ordering_cols)
143+
144+
if root.limit is not None:
145+
sqlglot_ir = sqlglot_ir.limit(root.limit)
129146

130-
# TODO: add order_by, limit to sqlglot_expr
131147
return sqlglot_ir.sql
132148

133149
@functools.lru_cache(maxsize=5000)

bigframes/core/compile/sqlglot/sqlglot_ir.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from bigframes.core import guid
2929
import bigframes.core.compile.sqlglot.sqlglot_types as sgt
3030
import bigframes.core.local_data as local_data
31-
import bigframes.core.schema as schemata
31+
import bigframes.core.schema as bf_schema
3232

3333
# shapely.wkt.dumps was moved to shapely.io.to_wkt in 2.0.
3434
try:
@@ -67,7 +67,7 @@ def sql(self) -> str:
6767
def from_pyarrow(
6868
cls,
6969
pa_table: pa.Table,
70-
schema: schemata.ArraySchema,
70+
schema: bf_schema.ArraySchema,
7171
uid_gen: guid.SequentialUIDGenerator,
7272
) -> SQLGlotIR:
7373
"""Builds SQLGlot expression from pyarrow table."""
@@ -203,6 +203,7 @@ def from_union(
203203
def select(
204204
self,
205205
selected_cols: tuple[tuple[str, sge.Expression], ...],
206+
squash_selections: bool = True,
206207
) -> SQLGlotIR:
207208
selections = [
208209
sge.Alias(
@@ -211,15 +212,39 @@ def select(
211212
)
212213
for id, expr in selected_cols
213214
]
214-
# Attempts to simplify selected columns when the original and new column
215-
# names are simply aliases of each other.
216-
squashed_selections = _squash_selections(self.expr.expressions, selections)
217-
if squashed_selections != []:
218-
new_expr = self.expr.select(*squashed_selections, append=False)
219-
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
215+
216+
# If squashing is enabled, we try to simplify the selections
217+
# by checking if the new selections are simply aliases of the
218+
# original columns.
219+
if squash_selections:
220+
new_selections = _squash_selections(self.expr.expressions, selections)
221+
if new_selections != []:
222+
new_expr = self.expr.select(*new_selections, append=False)
223+
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
224+
225+
new_expr = self._encapsulate_as_cte().select(*selections, append=False)
226+
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
227+
228+
def order_by(
229+
self,
230+
ordering: tuple[sge.Ordered, ...],
231+
) -> SQLGlotIR:
232+
"""Adds ORDER BY clause to the query."""
233+
if len(ordering) == 0:
234+
return SQLGlotIR(expr=self.expr.copy(), uid_gen=self.uid_gen)
235+
new_expr = self.expr.order_by(*ordering)
236+
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
237+
238+
def limit(
239+
self,
240+
limit: int | None,
241+
) -> SQLGlotIR:
242+
"""Adds LIMIT clause to the query."""
243+
if limit is not None:
244+
new_expr = self.expr.limit(limit)
220245
else:
221-
new_expr = self._encapsulate_as_cte().select(*selections, append=False)
222-
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
246+
new_expr = self.expr.copy()
247+
return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen)
223248

224249
def project(
225250
self,
@@ -342,6 +367,7 @@ def _squash_selections(
342367
old_expr: list[sge.Expression], new_expr: list[sge.Alias]
343368
) -> list[sge.Alias]:
344369
"""
370+
TODO: Reanble this function to optimize the SQL.
345371
Simplifies the select column expressions if existing (old_expr) and
346372
new (new_expr) selected columns are both simple aliases of column definitions.
347373

tests/unit/core/compile/sqlglot/snapshots/test_compile_concat/test_compile_concat/out.sql

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,4 +104,7 @@ SELECT
104104
`bfcol_47` AS `rowindex_1`,
105105
`bfcol_48` AS `int64_col`,
106106
`bfcol_49` AS `string_col`
107-
FROM `bfcte_12`
107+
FROM `bfcte_12`
108+
ORDER BY
109+
`bfcol_50` ASC NULLS LAST,
110+
`bfcol_51` ASC NULLS LAST

tests/unit/core/compile/sqlglot/snapshots/test_compile_projection/test_compile_projection/out.sql

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,19 @@ WITH `bfcte_0` AS (
1515
`bfcol_4` AS `bfcol_8`,
1616
`bfcol_1` + 1 AS `bfcol_9`
1717
FROM `bfcte_0`
18+
), `bfcte_2` AS (
19+
SELECT
20+
`bfcol_5` AS `bfcol_10`,
21+
`bfcol_9` AS `bfcol_11`,
22+
`bfcol_6` AS `bfcol_12`,
23+
`bfcol_7` AS `bfcol_13`,
24+
`bfcol_8` AS `bfcol_14`
25+
FROM `bfcte_1`
1826
)
1927
SELECT
20-
`bfcol_5` AS `rowindex`,
21-
`bfcol_9` AS `int64_col`,
22-
`bfcol_6` AS `string_col`,
23-
`bfcol_7` AS `float64_col`,
24-
`bfcol_8` AS `bool_col`
25-
FROM `bfcte_1`
28+
`bfcol_10` AS `rowindex`,
29+
`bfcol_11` AS `int64_col`,
30+
`bfcol_12` AS `string_col`,
31+
`bfcol_13` AS `float64_col`,
32+
`bfcol_14` AS `bool_col`
33+
FROM `bfcte_2`

tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal/out.sql

Lines changed: 37 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -155,21 +155,42 @@ WITH `bfcte_0` AS (
155155
CAST(NULL AS TIMESTAMP),
156156
8
157157
)])
158+
), `bfcte_1` AS (
159+
SELECT
160+
`bfcol_0` AS `bfcol_16`,
161+
`bfcol_1` AS `bfcol_17`,
162+
`bfcol_2` AS `bfcol_18`,
163+
`bfcol_3` AS `bfcol_19`,
164+
`bfcol_4` AS `bfcol_20`,
165+
`bfcol_5` AS `bfcol_21`,
166+
`bfcol_6` AS `bfcol_22`,
167+
`bfcol_7` AS `bfcol_23`,
168+
`bfcol_8` AS `bfcol_24`,
169+
`bfcol_9` AS `bfcol_25`,
170+
`bfcol_10` AS `bfcol_26`,
171+
`bfcol_11` AS `bfcol_27`,
172+
`bfcol_12` AS `bfcol_28`,
173+
`bfcol_13` AS `bfcol_29`,
174+
`bfcol_14` AS `bfcol_30`,
175+
`bfcol_15` AS `bfcol_31`
176+
FROM `bfcte_0`
158177
)
159178
SELECT
160-
`bfcol_0` AS `rowindex`,
161-
`bfcol_1` AS `bool_col`,
162-
`bfcol_2` AS `bytes_col`,
163-
`bfcol_3` AS `date_col`,
164-
`bfcol_4` AS `datetime_col`,
165-
`bfcol_5` AS `geography_col`,
166-
`bfcol_6` AS `int64_col`,
167-
`bfcol_7` AS `int64_too`,
168-
`bfcol_8` AS `numeric_col`,
169-
`bfcol_9` AS `float64_col`,
170-
`bfcol_10` AS `rowindex_1`,
171-
`bfcol_11` AS `rowindex_2`,
172-
`bfcol_12` AS `string_col`,
173-
`bfcol_13` AS `time_col`,
174-
`bfcol_14` AS `timestamp_col`
175-
FROM `bfcte_0`
179+
`bfcol_16` AS `rowindex`,
180+
`bfcol_17` AS `bool_col`,
181+
`bfcol_18` AS `bytes_col`,
182+
`bfcol_19` AS `date_col`,
183+
`bfcol_20` AS `datetime_col`,
184+
`bfcol_21` AS `geography_col`,
185+
`bfcol_22` AS `int64_col`,
186+
`bfcol_23` AS `int64_too`,
187+
`bfcol_24` AS `numeric_col`,
188+
`bfcol_25` AS `float64_col`,
189+
`bfcol_26` AS `rowindex_1`,
190+
`bfcol_27` AS `rowindex_2`,
191+
`bfcol_28` AS `string_col`,
192+
`bfcol_29` AS `time_col`,
193+
`bfcol_30` AS `timestamp_col`
194+
FROM `bfcte_1`
195+
ORDER BY
196+
`bfcol_31` ASC NULLS LAST

tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_json_df/out.sql

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,14 @@ WITH `bfcte_0` AS (
22
SELECT
33
*
44
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` JSON, `bfcol_1` INT64>>[STRUCT(PARSE_JSON('null'), 0), STRUCT(PARSE_JSON('true'), 1), STRUCT(PARSE_JSON('100'), 2), STRUCT(PARSE_JSON('0.98'), 3), STRUCT(PARSE_JSON('"a string"'), 4), STRUCT(PARSE_JSON('[]'), 5), STRUCT(PARSE_JSON('[1,2,3]'), 6), STRUCT(PARSE_JSON('[{"a":1},{"a":2},{"a":null},{}]'), 7), STRUCT(PARSE_JSON('"100"'), 8), STRUCT(PARSE_JSON('{"date":"2024-07-16"}'), 9), STRUCT(PARSE_JSON('{"int_value":2,"null_filed":null}'), 10), STRUCT(PARSE_JSON('{"list_data":[10,20,30]}'), 11)])
5+
), `bfcte_1` AS (
6+
SELECT
7+
`bfcol_0` AS `bfcol_2`,
8+
`bfcol_1` AS `bfcol_3`
9+
FROM `bfcte_0`
510
)
611
SELECT
7-
`bfcol_0` AS `json_col`
8-
FROM `bfcte_0`
12+
`bfcol_2` AS `json_col`
13+
FROM `bfcte_1`
14+
ORDER BY
15+
`bfcol_3` ASC NULLS LAST

tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_lists_df/out.sql

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,28 @@ WITH `bfcte_0` AS (
3232
['', 'a'],
3333
2
3434
)])
35+
), `bfcte_1` AS (
36+
SELECT
37+
`bfcol_0` AS `bfcol_9`,
38+
`bfcol_1` AS `bfcol_10`,
39+
`bfcol_2` AS `bfcol_11`,
40+
`bfcol_3` AS `bfcol_12`,
41+
`bfcol_4` AS `bfcol_13`,
42+
`bfcol_5` AS `bfcol_14`,
43+
`bfcol_6` AS `bfcol_15`,
44+
`bfcol_7` AS `bfcol_16`,
45+
`bfcol_8` AS `bfcol_17`
46+
FROM `bfcte_0`
3547
)
3648
SELECT
37-
`bfcol_0` AS `rowindex`,
38-
`bfcol_1` AS `int_list_col`,
39-
`bfcol_2` AS `bool_list_col`,
40-
`bfcol_3` AS `float_list_col`,
41-
`bfcol_4` AS `date_list_col`,
42-
`bfcol_5` AS `date_time_list_col`,
43-
`bfcol_6` AS `numeric_list_col`,
44-
`bfcol_7` AS `string_list_col`
45-
FROM `bfcte_0`
49+
`bfcol_9` AS `rowindex`,
50+
`bfcol_10` AS `int_list_col`,
51+
`bfcol_11` AS `bool_list_col`,
52+
`bfcol_12` AS `float_list_col`,
53+
`bfcol_13` AS `date_list_col`,
54+
`bfcol_14` AS `date_time_list_col`,
55+
`bfcol_15` AS `numeric_list_col`,
56+
`bfcol_16` AS `string_list_col`
57+
FROM `bfcte_1`
58+
ORDER BY
59+
`bfcol_17` ASC NULLS LAST

tests/unit/core/compile/sqlglot/snapshots/test_compile_readlocal/test_compile_readlocal_w_structs_df/out.sql

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,16 @@ WITH `bfcte_0` AS (
1818
),
1919
1
2020
)])
21+
), `bfcte_1` AS (
22+
SELECT
23+
`bfcol_0` AS `bfcol_3`,
24+
`bfcol_1` AS `bfcol_4`,
25+
`bfcol_2` AS `bfcol_5`
26+
FROM `bfcte_0`
2127
)
2228
SELECT
23-
`bfcol_0` AS `id`,
24-
`bfcol_1` AS `person`
25-
FROM `bfcte_0`
29+
`bfcol_3` AS `id`,
30+
`bfcol_4` AS `person`
31+
FROM `bfcte_1`
32+
ORDER BY
33+
`bfcol_5` ASC NULLS LAST
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`rowindex` AS `bfcol_0`,
4+
`int64_col` AS `bfcol_1`,
5+
`string_col` AS `bfcol_2`,
6+
`float64_col` AS `bfcol_3`,
7+
`bool_col` AS `bfcol_4`
8+
FROM `test-project`.`test_dataset`.`test_table`
9+
), `bfcte_1` AS (
10+
SELECT
11+
*,
12+
`bfcol_1` AS `bfcol_5`
13+
FROM `bfcte_0`
14+
)
15+
SELECT
16+
`bfcol_0` AS `rowindex`,
17+
`bfcol_1` AS `int64_col`,
18+
`bfcol_2` AS `string_col`,
19+
`bfcol_3` AS `float64_col`,
20+
`bfcol_4` AS `bool_col`
21+
FROM `bfcte_1`
22+
ORDER BY
23+
`bfcol_5` ASC NULLS LAST
24+
LIMIT 10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`rowindex` AS `bfcol_0`,
4+
`int64_col` AS `bfcol_1`,
5+
`string_col` AS `bfcol_2`,
6+
`float64_col` AS `bfcol_3`,
7+
`bool_col` AS `bfcol_4`
8+
FROM `test-project`.`test_dataset`.`test_table`
9+
), `bfcte_1` AS (
10+
SELECT
11+
`bfcol_0` AS `bfcol_5`,
12+
`bfcol_1` AS `bfcol_6`,
13+
`bfcol_2` AS `bfcol_7`,
14+
`bfcol_3` AS `bfcol_8`,
15+
`bfcol_4` AS `bfcol_9`
16+
FROM `bfcte_0`
17+
), `bfcte_2` AS (
18+
SELECT
19+
*,
20+
`bfcol_5` AS `bfcol_10`
21+
FROM `bfcte_1`
22+
), `bfcte_3` AS (
23+
SELECT
24+
`bfcol_5` AS `bfcol_11`,
25+
`bfcol_6` AS `bfcol_12`,
26+
`bfcol_7` AS `bfcol_13`,
27+
`bfcol_8` AS `bfcol_14`,
28+
`bfcol_9` AS `bfcol_15`,
29+
`bfcol_10` AS `bfcol_16`
30+
FROM `bfcte_2`
31+
)
32+
SELECT
33+
`bfcol_11` AS `rowindex`,
34+
`bfcol_12` AS `int64_col`,
35+
`bfcol_13` AS `string_col`,
36+
`bfcol_14` AS `float64_col`,
37+
`bfcol_15` AS `bool_col`
38+
FROM `bfcte_3`
39+
ORDER BY
40+
`bfcol_16` ASC NULLS LAST

0 commit comments

Comments
 (0)