Commit aa2f73a
feat: add bigframes.bigquery.sql_scalar() to apply SQL syntax on Series objects (#1293)
* feat: add `bigframes.bigquery.sql_scalar()` to apply SQL syntax on Series objects
* add SqlScalarOp
* fix ibis compilation and add a test
* remove error from notebook
* fix mypy
* simplify to use literals in dry run sql
* add support for all types in scalars df
* fix doctest
* fix unit tests and mypy
* add unit tests for literal to SQL
1 parent 2de89c3 commit aa2f73a

File tree: 13 files changed (+581, -87 lines)

bigframes/bigquery/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -30,6 +30,7 @@
     parse_json,
 )
 from bigframes.bigquery._operations.search import create_vector_index, vector_search
+from bigframes.bigquery._operations.sql import sql_scalar
 from bigframes.bigquery._operations.struct import struct

 __all__ = [
@@ -48,6 +49,8 @@
     # search ops
     "create_vector_index",
     "vector_search",
+    # sql ops
+    "sql_scalar",
     # struct ops
     "struct",
 ]
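With this re-export in place, the function is reachable from the public ``bigframes.bigquery`` namespace (the module itself is added below). A trivial check:

import bigframes.bigquery as bbq

# "sql_scalar" is now exported alongside the search and struct ops.
assert "sql_scalar" in bbq.__all__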

bigframes/bigquery/_operations/sql.py

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SQL escape hatch features."""
+
+from __future__ import annotations
+
+from typing import Sequence
+
+import google.cloud.bigquery
+
+import bigframes.core.sql
+import bigframes.dataframe
+import bigframes.dtypes
+import bigframes.operations
+import bigframes.series
+
+
+def sql_scalar(
+    sql_template: str,
+    columns: Sequence[bigframes.series.Series],
+) -> bigframes.series.Series:
+    """Create a Series from a SQL template.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> import pandas as pd
+        >>> import pyarrow as pa
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(["1.5", "2.5", "3.5"])
+        >>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9)))
+        >>> bbq.sql_scalar("ROUND({0}, 0, 'ROUND_HALF_EVEN')", [s])
+        0    2.000000000
+        1    2.000000000
+        2    4.000000000
+        dtype: decimal128(38, 9)[pyarrow]
+
+    Args:
+        sql_template (str):
+            A SQL format string with Python-style {0} placeholders for each of
+            the Series objects in ``columns``.
+        columns (Sequence[bigframes.pandas.Series]):
+            Series objects representing the column inputs to the
+            ``sql_template``. Must contain at least one Series.
+
+    Returns:
+        bigframes.pandas.Series:
+            A Series with the SQL applied.
+
+    Raises:
+        ValueError: If ``columns`` is empty.
+    """
+    if len(columns) == 0:
+        raise ValueError("Must provide at least one column in columns")
+
+    # To integrate this into our expression trees, we need to get the output
+    # type, so we do some manual compilation and a dry run query to get that.
+    # Another benefit of this is that if there is a syntax error in the SQL
+    # template, then this will fail with an error earlier in the process,
+    # aiding users in debugging.
+    base_series = columns[0]
+    literals = [
+        bigframes.dtypes.bigframes_dtype_to_literal(column.dtype) for column in columns
+    ]
+    literals_sql = [bigframes.core.sql.simple_literal(literal) for literal in literals]
+
+    # Use the executor directly, because we want the original column IDs, not
+    # the user-friendly column names that block.to_sql_query() would produce.
+    select_sql = sql_template.format(*literals_sql)
+    dry_run_sql = f"SELECT {select_sql}"
+    bqclient = base_series._session.bqclient
+    job = bqclient.query(
+        dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True)
+    )
+    _, output_type = bigframes.dtypes.convert_schema_field(job.schema[0])
+
+    op = bigframes.operations.SqlScalarOp(
+        _output_type=output_type, sql_template=sql_template
+    )
+    return base_series._apply_nary_op(op, columns[1:])
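A short usage sketch of the new escape hatch with more than one input column, assuming an authenticated BigQuery session; the two-column CONCAT template and the example values are illustrative, not taken from the commit:

import bigframes.pandas as bpd
import bigframes.bigquery as bbq

bpd.options.display.progress_bar = None

# Each {i} placeholder in the template is filled by the i-th Series in `columns`.
first = bpd.Series(["New", "Golden Gate"])
second = bpd.Series(["York", "Bridge"])
result = bbq.sql_scalar("CONCAT({0}, ' ', {1})", columns=[first, second])
# Expected: a string Series with values "New York" and "Golden Gate Bridge";
# the dtype comes from the dry-run schema of SELECT CONCAT('string', ' ', 'string').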

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 11 additions & 0 deletions
@@ -1845,6 +1845,17 @@ def nary_remote_function_op_impl(
     return result


+@scalar_op_compiler.register_nary_op(ops.SqlScalarOp, pass_op=True)
+def sql_scalar_op_impl(*operands: ibis_types.Value, op: ops.SqlScalarOp):
+    return ibis_generic.SqlScalar(
+        op.sql_template,
+        values=tuple(typing.cast(ibis_generic.Value, expr.op()) for expr in operands),
+        output_type=bigframes.core.compile.ibis_types.bigframes_dtype_to_ibis_dtype(
+            op.output_type()
+        ),
+    ).to_expr()
+
+
 @scalar_op_compiler.register_nary_op(ops.StructOp, pass_op=True)
 def struct_op_impl(
     *values: ibis_types.Value, op: ops.StructOp

bigframes/core/sql.py

Lines changed: 24 additions & 4 deletions
@@ -18,10 +18,13 @@
 """

 import datetime
+import decimal
 import json
 import math
 from typing import cast, Collection, Iterable, Mapping, Optional, TYPE_CHECKING, Union

+import shapely  # type: ignore
+
 import bigframes.core.compile.googlesql as googlesql

 if TYPE_CHECKING:
@@ -31,12 +34,16 @@


 ### Writing SQL Values (literals, column references, table references, etc.)
-def simple_literal(value: str | int | bool | float | datetime.datetime):
+def simple_literal(value: bytes | str | int | bool | float | datetime.datetime | None):
     """Return quoted input string."""
     # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#literals
-    if isinstance(value, str):
+    if value is None:
+        return "NULL"
+    elif isinstance(value, str):
         # Single quoting seems to work nicer with ibis than double quoting
         return f"'{googlesql._escape_chars(value)}'"
+    elif isinstance(value, bytes):
+        return repr(value)
     elif isinstance(value, (bool, int)):
         return str(value)
     elif isinstance(value, float):
@@ -48,8 +55,21 @@ def simple_literal(value: str | int | bool | float | datetime.datetime):
         if value == -math.inf:
             return 'CAST("-inf" as FLOAT)'
         return str(value)
-    if isinstance(value, datetime.datetime):
-        return f"TIMESTAMP('{value.isoformat()}')"
+    # Check datetime first as it is a subclass of date
+    elif isinstance(value, datetime.datetime):
+        if value.tzinfo is None:
+            return f"DATETIME('{value.isoformat()}')"
+        else:
+            return f"TIMESTAMP('{value.isoformat()}')"
+    elif isinstance(value, datetime.date):
+        return f"DATE('{value.isoformat()}')"
+    elif isinstance(value, datetime.time):
+        return f"TIME(DATETIME('1970-01-01 {value.isoformat()}'))"
+    elif isinstance(value, shapely.Geometry):
+        return f"ST_GEOGFROMTEXT({simple_literal(shapely.to_wkt(value))})"
+    elif isinstance(value, decimal.Decimal):
+        # TODO: disambiguate BIGNUMERIC based on scale and/or precision
+        return f"CAST('{str(value)}' AS NUMERIC)"
     else:
         raise ValueError(f"Cannot produce literal for {value}")
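A quick illustration of the literal rendering added above; the expected return values in the comments follow directly from the new branches:

import datetime
import decimal

import bigframes.core.sql as sql

sql.simple_literal(None)                                  # NULL
sql.simple_literal(b"abc")                                # b'abc'
sql.simple_literal(datetime.datetime(2025, 1, 1, 12, 0))  # DATETIME('2025-01-01T12:00:00')
sql.simple_literal(
    datetime.datetime(2025, 1, 1, 12, 0, tzinfo=datetime.timezone.utc)
)                                                         # TIMESTAMP('2025-01-01T12:00:00+00:00')
sql.simple_literal(datetime.date(2025, 1, 1))             # DATE('2025-01-01')
sql.simple_literal(datetime.time(1, 2, 3))                # TIME(DATETIME('1970-01-01 01:02:03'))
sql.simple_literal(decimal.Decimal("1.5"))                # CAST('1.5' AS NUMERIC)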

bigframes/dtypes.py

Lines changed: 70 additions & 1 deletion
@@ -18,14 +18,15 @@
 import datetime
 import decimal
 import typing
-from typing import Dict, List, Literal, Union
+from typing import Any, Dict, List, Literal, Union

 import bigframes_vendored.constants as constants
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery
 import numpy as np
 import pandas as pd
 import pyarrow as pa
+import shapely  # type: ignore

 # Type hints for Pandas dtypes supported by BigQuery DataFrame
 Dtype = Union[
@@ -450,6 +451,74 @@ def bigframes_dtype_to_arrow_dtype(
     )


+def bigframes_dtype_to_literal(
+    bigframes_dtype: Dtype,
+) -> Any:
+    """Create a representative literal value for a bigframes dtype.
+
+    The inverse of infer_literal_type().
+    """
+    if isinstance(bigframes_dtype, pd.ArrowDtype):
+        arrow_type = bigframes_dtype.pyarrow_dtype
+        return arrow_type_to_literal(arrow_type)
+
+    if isinstance(bigframes_dtype, pd.Float64Dtype):
+        return 1.0
+    if isinstance(bigframes_dtype, pd.Int64Dtype):
+        return 1
+    if isinstance(bigframes_dtype, pd.BooleanDtype):
+        return True
+    if isinstance(bigframes_dtype, pd.StringDtype):
+        return "string"
+    if isinstance(bigframes_dtype, gpd.array.GeometryDtype):
+        return shapely.Point((0, 0))
+
+    raise ValueError(
+        f"No literal conversion for {bigframes_dtype}. {constants.FEEDBACK_LINK}"
+    )
+
+
+def arrow_type_to_literal(
+    arrow_type: pa.DataType,
+) -> Any:
+    """Create a representative literal value for an arrow type."""
+    if pa.types.is_list(arrow_type):
+        return [arrow_type_to_literal(arrow_type.value_type)]
+    if pa.types.is_struct(arrow_type):
+        return {
+            field.name: arrow_type_to_literal(field.type) for field in arrow_type.fields
+        }
+    if pa.types.is_string(arrow_type):
+        return "string"
+    if pa.types.is_binary(arrow_type):
+        return b"bytes"
+    if pa.types.is_floating(arrow_type):
+        return 1.0
+    if pa.types.is_integer(arrow_type):
+        return 1
+    if pa.types.is_boolean(arrow_type):
+        return True
+    if pa.types.is_date(arrow_type):
+        return datetime.date(2025, 1, 1)
+    if pa.types.is_timestamp(arrow_type):
+        return datetime.datetime(
+            2025,
+            1,
+            1,
+            1,
+            1,
+            tzinfo=datetime.timezone.utc if arrow_type.tz is not None else None,
+        )
+    if pa.types.is_decimal(arrow_type):
+        return decimal.Decimal("1.0")
+    if pa.types.is_time(arrow_type):
+        return datetime.time(1, 1, 1)
+
+    raise ValueError(
+        f"No literal conversion for {arrow_type}. {constants.FEEDBACK_LINK}"
+    )
+
+
 def infer_literal_type(literal) -> typing.Optional[Dtype]:
     # Maybe also normalize literal to canonical python representation to remove this burden from compilers?
     if pd.api.types.is_list_like(literal):
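These helpers feed the dry run in ``sql_scalar()``: each input dtype is replaced by a representative literal so the SQL template can be compiled without touching real data. A minimal sketch of that round trip (the placeholder SQL in the comments follows from ``simple_literal`` above):

import pandas as pd
import pyarrow as pa

import bigframes.core.sql
import bigframes.dtypes

dtype = pd.ArrowDtype(pa.decimal128(38, 9))
literal = bigframes.dtypes.bigframes_dtype_to_literal(dtype)  # Decimal('1.0')
placeholder = bigframes.core.sql.simple_literal(literal)      # "CAST('1.0' AS NUMERIC)"

# sql_scalar() then dry-runs something like:
#   SELECT ROUND(CAST('1.0' AS NUMERIC), 0, 'ROUND_HALF_EVEN')
# and reads the output dtype from the returned schema.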

bigframes/operations/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -78,6 +78,7 @@
     minimum_op,
     notnull_op,
     RowKey,
+    SqlScalarOp,
     where_op,
 )
 from bigframes.operations.geo_ops import geo_x_op, geo_y_op
@@ -191,6 +192,7 @@
     "minimum_op",
     "notnull_op",
     "RowKey",
+    "SqlScalarOp",
     "where_op",
     # String ops
     "capitalize_op",

bigframes/operations/generic_ops.py

Lines changed: 12 additions & 0 deletions
@@ -160,3 +160,15 @@ def is_bijective(self) -> bool:
     @property
     def deterministic(self) -> bool:
         return False
+
+
+@dataclasses.dataclass(frozen=True)
+class SqlScalarOp(base_ops.NaryOp):
+    """An escape to SQL, representing a single column."""
+
+    name: typing.ClassVar[str] = "sql_scalar"
+    _output_type: dtypes.ExpressionType
+    sql_template: str
+
+    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
+        return self._output_type
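The op is a thin, frozen carrier: its output dtype is fixed at construction (from the dry run) rather than derived from the inputs. A minimal illustration; the dtype constants are assumed to be the usual ``bigframes.dtypes`` module-level names:

import bigframes.dtypes
from bigframes.operations import SqlScalarOp

op = SqlScalarOp(
    _output_type=bigframes.dtypes.FLOAT_DTYPE,  # assumed constant name
    sql_template="ROUND({0}, 0)",
)
# output_type() ignores the input types and returns the stored dtype.
assert op.output_type(bigframes.dtypes.INT_DTYPE) == bigframes.dtypes.FLOAT_DTYPE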
