diff --git a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py index d64c7b1d3f..81bc9e0f56 100644 --- a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py @@ -31,17 +31,23 @@ @register_unary_op(ops.IsInOp, pass_op=True) def _(expr: TypedExpr, op: ops.IsInOp) -> sge.Expression: values = [] - is_numeric_expr = dtypes.is_numeric(expr.dtype) + is_numeric_expr = dtypes.is_numeric(expr.dtype, include_bool=False) for value in op.values: - if value is None: + if _is_null(value): continue dtype = dtypes.bigframes_type(type(value)) - if expr.dtype == dtype or is_numeric_expr and dtypes.is_numeric(dtype): + if ( + expr.dtype == dtype + or is_numeric_expr + and dtypes.is_numeric(dtype, include_bool=False) + ): values.append(sge.convert(value)) if op.match_nulls: contains_nulls = any(_is_null(value) for value in op.values) if contains_nulls: + if len(values) == 0: + return sge.Is(this=expr.expr, expression=sge.Null()) return sge.Is(this=expr.expr, expression=sge.Null()) | sge.In( this=expr.expr, expressions=values ) diff --git a/bigframes/core/compile/sqlglot/expressions/geo_ops.py b/bigframes/core/compile/sqlglot/expressions/geo_ops.py index a57b4bc931..9c6ba33ea5 100644 --- a/bigframes/core/compile/sqlglot/expressions/geo_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/geo_ops.py @@ -108,12 +108,12 @@ def _(expr: TypedExpr, op: ops.GeoStSimplifyOp) -> sge.Expression: @register_unary_op(ops.geo_x_op) def _(expr: TypedExpr) -> sge.Expression: - return sge.func("SAFE.ST_X", expr.expr) + return sge.func("ST_X", expr.expr) @register_unary_op(ops.geo_y_op) def _(expr: TypedExpr) -> sge.Expression: - return sge.func("SAFE.ST_Y", expr.expr) + return sge.func("ST_Y", expr.expr) @register_binary_op(ops.GeoStDistanceOp, pass_op=True) diff --git a/bigframes/core/compile/sqlglot/sqlglot_ir.py b/bigframes/core/compile/sqlglot/sqlglot_ir.py index 04176014b0..9445b65e99 100644 --- a/bigframes/core/compile/sqlglot/sqlglot_ir.py +++ b/bigframes/core/compile/sqlglot/sqlglot_ir.py @@ -558,16 +558,15 @@ def _explode_single_column( ) selection = sge.Star(replace=[unnested_column_alias.as_(column)]) - # TODO: "CROSS" if not keep_empty else "LEFT" - # TODO: overlaps_with_parent to replace existing column. new_expr = _select_to_cte( self.expr, sge.to_identifier( next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted ), ) + # Use LEFT JOIN to preserve rows when unnesting empty arrays. new_expr = new_expr.select(selection, append=False).join( - unnest_expr, join_type="CROSS" + unnest_expr, join_type="LEFT" ) return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) @@ -621,8 +620,9 @@ def _explode_multiple_columns( next(self.uid_gen.get_uid_stream("bfcte_")), quoted=self.quoted ), ) + # Use LEFT JOIN to preserve rows when unnesting empty arrays. new_expr = new_expr.select(selection, append=False).join( - unnest_expr, join_type="CROSS" + unnest_expr, join_type="LEFT" ) return SQLGlotIR(expr=new_expr, uid_gen=self.uid_gen) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql index 197ed279fa..ec85f060da 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_is_in/out.sql @@ -1,32 +1,35 @@ WITH `bfcte_0` AS ( SELECT + `bool_col`, `float64_col`, `int64_col` FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` ), `bfcte_1` AS ( SELECT *, - COALESCE(`int64_col` IN (1, 2, 3), FALSE) AS `bfcol_2`, - ( - `int64_col` IS NULL - ) OR `int64_col` IN (123456) AS `bfcol_3`, - COALESCE(`int64_col` IN (1.0, 2.0, 3.0), FALSE) AS `bfcol_4`, - FALSE AS `bfcol_5`, - COALESCE(`int64_col` IN (2.5, 3), FALSE) AS `bfcol_6`, + COALESCE(`bool_col` IN (TRUE, FALSE), FALSE) AS `bfcol_3`, + COALESCE(`int64_col` IN (1, 2, 3), FALSE) AS `bfcol_4`, + `int64_col` IS NULL AS `bfcol_5`, + COALESCE(`int64_col` IN (1.0, 2.0, 3.0), FALSE) AS `bfcol_6`, FALSE AS `bfcol_7`, - COALESCE(`int64_col` IN (123456), FALSE) AS `bfcol_8`, + COALESCE(`int64_col` IN (2.5, 3), FALSE) AS `bfcol_8`, + FALSE AS `bfcol_9`, + FALSE AS `bfcol_10`, + COALESCE(`int64_col` IN (123456), FALSE) AS `bfcol_11`, ( `float64_col` IS NULL - ) OR `float64_col` IN (1, 2, 3) AS `bfcol_9` + ) OR `float64_col` IN (1, 2, 3) AS `bfcol_12` FROM `bfcte_0` ) SELECT - `bfcol_2` AS `ints`, - `bfcol_3` AS `ints_w_null`, - `bfcol_4` AS `floats`, - `bfcol_5` AS `strings`, - `bfcol_6` AS `mixed`, - `bfcol_7` AS `empty`, - `bfcol_8` AS `ints_wo_match_nulls`, - `bfcol_9` AS `float_in_ints` + `bfcol_3` AS `bools`, + `bfcol_4` AS `ints`, + `bfcol_5` AS `ints_w_null`, + `bfcol_6` AS `floats`, + `bfcol_7` AS `strings`, + `bfcol_8` AS `mixed`, + `bfcol_9` AS `empty`, + `bfcol_10` AS `empty_wo_match_nulls`, + `bfcol_11` AS `ints_wo_match_nulls`, + `bfcol_12` AS `float_in_ints` FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_x/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_x/out.sql index 09211270d1..826eb9f209 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_x/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_x/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - SAFE.ST_X(`geography_col`) AS `bfcol_1` + ST_X(`geography_col`) AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_y/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_y/out.sql index 625613ae2a..dd411820b2 100644 --- a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_y/out.sql +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_geo_ops/test_geo_y/out.sql @@ -5,7 +5,7 @@ WITH `bfcte_0` AS ( ), `bfcte_1` AS ( SELECT *, - SAFE.ST_Y(`geography_col`) AS `bfcol_1` + ST_Y(`geography_col`) AS `bfcol_1` FROM `bfcte_0` ) SELECT diff --git a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py index 20dd6c5ca6..ea94bcae56 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pandas as pd import pytest from bigframes import operations as ops @@ -22,18 +23,23 @@ def test_is_in(scalar_types_df: bpd.DataFrame, snapshot): + bool_col = "bool_col" int_col = "int64_col" float_col = "float64_col" - bf_df = scalar_types_df[[int_col, float_col]] + bf_df = scalar_types_df[[bool_col, int_col, float_col]] ops_map = { + "bools": ops.IsInOp(values=(True, False)).as_expr(bool_col), "ints": ops.IsInOp(values=(1, 2, 3)).as_expr(int_col), - "ints_w_null": ops.IsInOp(values=(None, 123456)).as_expr(int_col), + "ints_w_null": ops.IsInOp(values=(None, pd.NA)).as_expr(int_col), "floats": ops.IsInOp(values=(1.0, 2.0, 3.0), match_nulls=False).as_expr( int_col ), "strings": ops.IsInOp(values=("1.0", "2.0")).as_expr(int_col), "mixed": ops.IsInOp(values=("1.0", 2.5, 3)).as_expr(int_col), "empty": ops.IsInOp(values=()).as_expr(int_col), + "empty_wo_match_nulls": ops.IsInOp(values=(), match_nulls=False).as_expr( + int_col + ), "ints_wo_match_nulls": ops.IsInOp( values=(None, 123456), match_nulls=False ).as_expr(int_col), diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql index e594b67669..5d9019439f 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_dataframe/out.sql @@ -9,7 +9,7 @@ WITH `bfcte_0` AS ( * REPLACE (`int_list_col`[SAFE_OFFSET(`bfcol_13`)] AS `int_list_col`, `string_list_col`[SAFE_OFFSET(`bfcol_13`)] AS `string_list_col`) FROM `bfcte_0` - CROSS JOIN UNNEST(GENERATE_ARRAY(0, LEAST(ARRAY_LENGTH(`int_list_col`) - 1, ARRAY_LENGTH(`string_list_col`) - 1))) AS `bfcol_13` WITH OFFSET AS `bfcol_7` + LEFT JOIN UNNEST(GENERATE_ARRAY(0, LEAST(ARRAY_LENGTH(`int_list_col`) - 1, ARRAY_LENGTH(`string_list_col`) - 1))) AS `bfcol_13` WITH OFFSET AS `bfcol_7` ) SELECT `rowindex`, diff --git a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql index 5af0aa0092..8ba4559da8 100644 --- a/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql +++ b/tests/unit/core/compile/sqlglot/snapshots/test_compile_explode/test_compile_explode_series/out.sql @@ -8,7 +8,7 @@ WITH `bfcte_0` AS ( * REPLACE (`bfcol_8` AS `int_list_col`) FROM `bfcte_0` - CROSS JOIN UNNEST(`int_list_col`) AS `bfcol_8` WITH OFFSET AS `bfcol_4` + LEFT JOIN UNNEST(`int_list_col`) AS `bfcol_8` WITH OFFSET AS `bfcol_4` ) SELECT `rowindex`,