Skip to content

Commit 4ef8bac

Browse files
authored
feat: support json_extract_string_array in the bigquery module (#1131)
1 parent 07bf2d4 commit 4ef8bac

File tree

5 files changed

+188
-24
lines changed

5 files changed

+188
-24
lines changed

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from bigframes.bigquery._operations.json import (
2626
json_extract,
2727
json_extract_array,
28+
json_extract_string_array,
2829
json_set,
2930
)
3031
from bigframes.bigquery._operations.search import create_vector_index, vector_search
@@ -37,6 +38,7 @@
3738
"json_set",
3839
"json_extract",
3940
"json_extract_array",
41+
"json_extract_string_array",
4042
"approx_top_count",
4143
"struct",
4244
"create_vector_index",

bigframes/bigquery/_operations/json.py

Lines changed: 104 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,17 @@
2121

2222
from __future__ import annotations
2323

24-
from typing import Any, Sequence, Tuple
24+
from typing import Any, cast, Optional, Sequence, Tuple, Union
2525

26+
import bigframes.dtypes
2627
import bigframes.operations as ops
2728
import bigframes.series as series
2829

30+
from . import array
31+
2932

3033
def json_set(
31-
series: series.Series,
34+
input: series.Series,
3235
json_path_value_pairs: Sequence[Tuple[str, Any]],
3336
) -> series.Series:
3437
"""Produces a new JSON value within a Series by inserting or replacing values at
@@ -47,7 +50,7 @@ def json_set(
4750
Name: data, dtype: string
4851
4952
Args:
50-
series (bigframes.series.Series):
53+
input (bigframes.series.Series):
5154
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
5255
json_path_value_pairs (Sequence[Tuple[str, Any]]):
5356
Pairs of JSON path and the new value to insert/replace.
@@ -59,6 +62,7 @@ def json_set(
5962
# SQLGlot parser does not support the "create_if_missing => true" syntax, so
6063
# create_if_missing is not currently implemented.
6164

65+
result = input
6266
for json_path_value_pair in json_path_value_pairs:
6367
if len(json_path_value_pair) != 2:
6468
raise ValueError(
@@ -67,14 +71,14 @@ def json_set(
6771
)
6872

6973
json_path, json_value = json_path_value_pair
70-
series = series._apply_binary_op(
74+
result = result._apply_binary_op(
7175
json_value, ops.JSONSet(json_path=json_path), alignment="left"
7276
)
73-
return series
77+
return result
7478

7579

7680
def json_extract(
77-
series: series.Series,
81+
input: series.Series,
7882
json_path: str,
7983
) -> series.Series:
8084
"""Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
@@ -93,24 +97,24 @@ def json_extract(
9397
dtype: string
9498
9599
Args:
96-
series (bigframes.series.Series):
100+
input (bigframes.series.Series):
97101
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
98102
json_path (str):
99103
The JSON path identifying the data that you want to obtain from the input.
100104
101105
Returns:
102106
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
103107
"""
104-
return series._apply_unary_op(ops.JSONExtract(json_path=json_path))
108+
return input._apply_unary_op(ops.JSONExtract(json_path=json_path))
105109

106110

107111
def json_extract_array(
108-
series: series.Series,
112+
input: series.Series,
109113
json_path: str = "$",
110114
) -> series.Series:
111-
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON`
112-
values. This function uses single quotes and brackets to escape invalid JSONPath
113-
characters in JSON keys.
115+
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted
116+
`STRING` or `JSON` values. This function uses single quotes and brackets to
117+
escape invalid JSONPath characters in JSON keys.
114118
115119
**Examples:**
116120
@@ -124,13 +128,98 @@ def json_extract_array(
124128
1 ['4' '5']
125129
dtype: list<item: string>[pyarrow]
126130
131+
>>> s = bpd.Series([
132+
... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
133+
... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}'
134+
... ])
135+
>>> bbq.json_extract_array(s, "$.fruits")
136+
0 ['{"name":"apple"}' '{"name":"cherry"}']
137+
1 ['{"name":"guava"}' '{"name":"grapes"}']
138+
dtype: list<item: string>[pyarrow]
139+
140+
>>> s = bpd.Series([
141+
... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
142+
... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
143+
... ])
144+
>>> bbq.json_extract_array(s, "$.fruits.names")
145+
0 ['"apple"' '"cherry"']
146+
1 ['"guava"' '"grapes"']
147+
dtype: list<item: string>[pyarrow]
148+
127149
Args:
128-
series (bigframes.series.Series):
150+
input (bigframes.series.Series):
129151
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
130152
json_path (str):
131153
The JSON path identifying the data that you want to obtain from the input.
132154
133155
Returns:
134-
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
156+
bigframes.series.Series: A new Series with the parsed arrays from the input.
135157
"""
136-
return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
158+
return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
159+
160+
161+
def json_extract_string_array(
    input: series.Series,
    json_path: str = "$",
    value_dtype: Optional[
        Union[bigframes.dtypes.Dtype, bigframes.dtypes.DtypeString]
    ] = None,
) -> series.Series:
    """Extracts a JSON array and converts it to a SQL array of `STRING` values.

    The array selected by `json_path` is first extracted as an array of
    strings. When `value_dtype` is given (and is not the string dtype), the
    arrays are exploded, every element is converted to `value_dtype`, and the
    elements are aggregated back into one array per index value of `input`.
    This function uses single quotes and brackets to escape invalid JSONPath
    characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_extract_string_array(s)
        0    ['1' '2' '3']
        1        ['4' '5']
        dtype: list<item: string>[pyarrow]

        >>> bbq.json_extract_string_array(s, value_dtype='Int64')
        0    [1 2 3]
        1      [4 5]
        dtype: list<item: int64>[pyarrow]

        >>> s = bpd.Series([
        ...   '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
        ...   '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
        ... ])
        >>> bbq.json_extract_string_array(s, "$.fruits.names")
        0    ['apple' 'cherry']
        1    ['guava' 'grapes']
        dtype: list<item: string>[pyarrow]

    Args:
        input (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path identifying the array to extract from the input.
            Defaults to ``"$"``.
        value_dtype (dtype, Optional):
            The dtype each array element is converted to. ``None`` (the
            default) or the string dtype leaves the elements as strings.

    Returns:
        bigframes.series.Series: A new Series with the parsed arrays from the input.
    """
    string_arrays = input._apply_unary_op(
        ops.JSONExtractStringArray(json_path=json_path)
    )

    # No element conversion requested: return the extracted string arrays as-is.
    if value_dtype in (None, bigframes.dtypes.STRING_DTYPE):
        return string_arrays

    # Flatten to one element per row so a scalar conversion can be applied.
    items = string_arrays.explode()
    if value_dtype == bigframes.dtypes.BOOL_DTYPE:
        # Booleans are derived by a case-insensitive comparison with "true"
        # rather than by astype().
        items = items.str.lower() == "true"
    else:
        items = items.astype(value_dtype)

    # Rebuild one array per original row by grouping on the input's index levels.
    # NOTE(review): rows sharing the same index label land in the same group;
    # confirm callers are expected to pass a uniquely indexed Series.
    regrouped = items.groupby(level=input.index.names, dropna=False)
    return cast(series.Series, array.array_agg(regrouped))

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1140,6 +1140,13 @@ def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
11401140
return json_extract_array(json_obj=x, json_path=op.json_path)
11411141

11421142

1143+
@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True)
def json_extract_string_array_op_impl(
    x: ibis_types.Value, op: ops.JSONExtractStringArray
):
    """Compile ``JSONExtractStringArray`` by calling the
    ``json_extract_string_array`` builtin with the op's JSON path."""
    path = op.json_path
    return json_extract_string_array(json_obj=x, json_path=path)
1148+
1149+
11431150
### Binary Ops
11441151
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
11451152
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
@@ -1801,6 +1808,13 @@ def json_extract_array(
18011808
"""Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""
18021809

18031810

1811+
@ibis.udf.scalar.builtin(name="json_extract_string_array")
def json_extract_string_array(
    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Builtin stub: extract the JSON array at ``json_path`` from ``json_obj``
    as a SQL ARRAY of STRINGs."""
1816+
1817+
18041818
@ibis.udf.scalar.builtin(name="ML.DISTANCE")
18051819
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
18061820
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""

bigframes/operations/__init__.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
from pandas.tseries.offsets import DateOffset
2626
import pyarrow as pa
2727

28-
import bigframes.dtypes
2928
import bigframes.dtypes as dtypes
3029
import bigframes.operations.type as op_typing
3130

@@ -526,6 +525,13 @@ class RemoteFunctionOp(UnaryOp):
526525
def output_type(self, *input_types):
527526
# This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method
528527
if hasattr(self.func, "output_dtype"):
528+
if dtypes.is_array_like(self.func.output_dtype):
529+
# TODO(b/284515241): remove this special handling to support
530+
# array output types once BQ remote functions support ARRAY.
531+
# Until then, use json serialized strings at the remote function
532+
# level, and parse that to the intended output type at the
533+
# bigframes level.
534+
return dtypes.STRING_DTYPE
529535
return self.func.output_dtype
530536
else:
531537
raise AttributeError("output_dtype not defined")
@@ -548,9 +554,9 @@ class ToDatetimeOp(UnaryOp):
548554

549555
def output_type(self, *input_types):
550556
if input_types[0] not in (
551-
bigframes.dtypes.FLOAT_DTYPE,
552-
bigframes.dtypes.INT_DTYPE,
553-
bigframes.dtypes.STRING_DTYPE,
557+
dtypes.FLOAT_DTYPE,
558+
dtypes.INT_DTYPE,
559+
dtypes.STRING_DTYPE,
554560
):
555561
raise TypeError("expected string or numeric input")
556562
return pd.ArrowDtype(pa.timestamp("us", tz=None))
@@ -565,9 +571,9 @@ class ToTimestampOp(UnaryOp):
565571
def output_type(self, *input_types):
566572
# Must be numeric or string
567573
if input_types[0] not in (
568-
bigframes.dtypes.FLOAT_DTYPE,
569-
bigframes.dtypes.INT_DTYPE,
570-
bigframes.dtypes.STRING_DTYPE,
574+
dtypes.FLOAT_DTYPE,
575+
dtypes.INT_DTYPE,
576+
dtypes.STRING_DTYPE,
571577
):
572578
raise TypeError("expected string or numeric input")
573579
return pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
@@ -699,6 +705,23 @@ def output_type(self, *input_types):
699705
)
700706

701707

708+
@dataclasses.dataclass(frozen=True)
class JSONExtractStringArray(UnaryOp):
    """Unary op that extracts the JSON array at ``json_path`` as an array of strings."""

    name: typing.ClassVar[str] = "json_extract_string_array"
    # JSON path selecting the array to extract from each input value.
    json_path: str

    def output_type(self, *input_types):
        """Return the Arrow list-of-strings dtype produced by this op.

        Raises:
            TypeError: If the first input type is not JSON-like.
        """
        operand_type = input_types[0]
        if not dtypes.is_json_like(operand_type):
            raise TypeError(
                "Input type must be an valid JSON object or JSON-formatted string type."
                f" Received type: {operand_type}"
            )
        string_arrow_type = dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)
        return pd.ArrowDtype(pa.list_(string_arrow_type))
723+
724+
702725
# Binary Ops
703726
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
704727
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)

tests/system/small/bigquery/test_json.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import pytest
2020

2121
import bigframes.bigquery as bbq
22+
import bigframes.dtypes
2223
import bigframes.pandas as bpd
2324

2425

@@ -142,9 +143,9 @@ def test_json_extract_w_invalid_series_type():
142143

143144

144145
def test_json_extract_array_from_json_strings():
145-
s = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}'])
146+
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
146147
actual = bbq.json_extract_array(s, "$.a")
147-
expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
148+
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
148149
pd.testing.assert_series_equal(
149150
actual.to_pandas(),
150151
expected.to_pandas(),
@@ -164,3 +165,38 @@ def test_json_extract_array_from_array_strings():
164165
def test_json_extract_array_w_invalid_series_type():
    """An integer Series passed to json_extract_array raises TypeError."""
    with pytest.raises(TypeError):
        integer_series = bpd.Series([1, 2])
        bbq.json_extract_array(integer_series)
168+
169+
170+
def test_json_extract_string_array_from_json_strings():
    """String arrays under ``$.a`` are returned as arrays of unquoted strings."""
    json_strings = bpd.Series(
        ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']
    )
    actual = bbq.json_extract_string_array(json_strings, "$.a")
    expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]])

    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)
178+
179+
180+
def test_json_extract_string_array_from_array_strings():
    """Top-level JSON arrays are extracted with the default ``$`` path."""
    array_strings = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
    actual = bbq.json_extract_string_array(array_strings)
    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])

    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)
188+
189+
190+
def test_json_extract_string_array_as_float_array_from_array_strings():
    """Passing the float dtype as ``value_dtype`` yields arrays of floats."""
    array_strings = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"])
    float_dtype = bigframes.dtypes.FLOAT_DTYPE
    actual = bbq.json_extract_string_array(array_strings, value_dtype=float_dtype)
    expected = bpd.Series([[1, 2.5, 3], [], [4, 5]])

    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)
198+
199+
200+
def test_json_extract_string_array_w_invalid_series_type():
    """An integer Series passed to json_extract_string_array raises TypeError."""
    with pytest.raises(TypeError):
        integer_series = bpd.Series([1, 2])
        bbq.json_extract_string_array(integer_series)

0 commit comments

Comments
 (0)