Skip to content

Commit 019051e

Browse files
feat: Add bbq.json_value_array and deprecate bbq.json_extract_string_array (#1818)
This commit introduces the `bbq.json_value_array` method, which provides similar functionality to `JSON_VALUE_ARRAY` in BigQuery Standard SQL. The `bbq.json_extract_string_array` method has been marked as deprecated and will be removed in a future version. You should migrate to `bbq.json_value_array` for equivalent functionality. Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
1 parent 8ebfa57 commit 019051e

File tree

6 files changed

+149
-2
lines changed

6 files changed

+149
-2
lines changed

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
json_query_array,
4444
json_set,
4545
json_value,
46+
json_value_array,
4647
parse_json,
4748
)
4849
from bigframes.bigquery._operations.search import create_vector_index, vector_search
@@ -71,6 +72,7 @@
7172
"json_query_array",
7273
"json_set",
7374
"json_value",
75+
"json_value_array",
7476
"parse_json",
7577
# search ops
7678
"create_vector_index",

bigframes/bigquery/_operations/json.py

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,10 @@ def json_extract_string_array(
196196
values in the array. This function uses single quotes and brackets to escape
197197
invalid JSONPath characters in JSON keys.
198198
199+
.. deprecated:: 2.6.0
200+
The ``json_extract_string_array`` is deprecated and will be removed in a future version.
201+
Use ``json_value_array`` instead.
202+
199203
**Examples:**
200204
201205
>>> import bigframes.pandas as bpd
@@ -233,6 +237,11 @@ def json_extract_string_array(
233237
Returns:
234238
bigframes.series.Series: A new Series with the parsed arrays from the input.
235239
"""
240+
msg = (
241+
"The `json_extract_string_array` is deprecated and will be removed in a future version. "
242+
"Use `json_value_array` instead."
243+
)
244+
warnings.warn(bfe.format_message(msg), category=UserWarning)
236245
array_series = input._apply_unary_op(
237246
ops.JSONExtractStringArray(json_path=json_path)
238247
)
@@ -334,7 +343,7 @@ def json_query_array(
334343

335344
def json_value(
336345
input: series.Series,
337-
json_path: str,
346+
json_path: str = "$",
338347
) -> series.Series:
339348
"""Extracts a JSON scalar value and converts it to a SQL ``STRING`` value. In
340349
addition, this function:
@@ -366,6 +375,61 @@ def json_value(
366375
return input._apply_unary_op(ops.JSONValue(json_path=json_path))
367376

368377

378+
def json_value_array(
379+
input: series.Series,
380+
json_path: str = "$",
381+
) -> series.Series:
382+
"""
383+
Extracts a JSON array of scalar values and converts it to a SQL ``ARRAY<STRING>``
384+
value. In addition, this function:
385+
386+
- Removes the outermost quotes and unescapes the values.
387+
- Returns a SQL ``NULL`` if the selected value isn't an array or isn't an array
388+
containing only scalar values.
389+
- Uses double quotes to escape invalid ``JSON_PATH`` characters in JSON keys.
390+
391+
**Examples:**
392+
393+
>>> import bigframes.pandas as bpd
394+
>>> import bigframes.bigquery as bbq
395+
>>> bpd.options.display.progress_bar = None
396+
397+
>>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
398+
>>> bbq.json_value_array(s)
399+
0 ['1' '2' '3']
400+
1 ['4' '5']
401+
dtype: list<item: string>[pyarrow]
402+
403+
>>> s = bpd.Series([
404+
... '{"fruits": ["apples", "oranges", "grapes"]}',
405+
... '{"fruits": ["guava", "grapes"]}'
406+
... ])
407+
>>> bbq.json_value_array(s, "$.fruits")
408+
0 ['apples' 'oranges' 'grapes']
409+
1 ['guava' 'grapes']
410+
dtype: list<item: string>[pyarrow]
411+
412+
>>> s = bpd.Series([
413+
... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
414+
... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
415+
... ])
416+
>>> bbq.json_value_array(s, "$.fruits.names")
417+
0 ['apple' 'cherry']
418+
1 ['guava' 'grapes']
419+
dtype: list<item: string>[pyarrow]
420+
421+
Args:
422+
input (bigframes.series.Series):
423+
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
424+
json_path (str, default "$"):
425+
The JSON path identifying the data that you want to obtain from the input.
426+
427+
Returns:
428+
bigframes.series.Series: A new Series with the parsed arrays from the input.
429+
"""
430+
return input._apply_unary_op(ops.JSONValueArray(json_path=json_path))
431+
432+
369433
@utils.preview(name="The JSON-related API `parse_json`")
370434
def parse_json(
371435
input: series.Series,

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,6 +1448,11 @@ def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue):
14481448
return json_value(json_obj=x, json_path=op.json_path)
14491449

14501450

1451+
@scalar_op_compiler.register_unary_op(ops.JSONValueArray, pass_op=True)
1452+
def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray):
1453+
return json_value_array(json_obj=x, json_path=op.json_path)
1454+
1455+
14511456
# Blob Ops
14521457
@scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op)
14531458
def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
@@ -2157,6 +2162,13 @@ def json_value( # type: ignore[empty-body]
21572162
"""Retrieve value of a JSON field as plain STRING."""
21582163

21592164

2165+
@ibis_udf.scalar.builtin(name="json_value_array")
2166+
def json_value_array( # type: ignore[empty-body]
2167+
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
2168+
) -> ibis_dtypes.Array[ibis_dtypes.String]:
2169+
"""Extracts a JSON array and converts it to a SQL ARRAY of STRINGs."""
2170+
2171+
21602172
@ibis_udf.scalar.builtin(name="INT64")
21612173
def cast_json_to_int64(json_str: ibis_dtypes.JSON) -> ibis_dtypes.Int64: # type: ignore[empty-body]
21622174
"""Converts a JSON number to a SQL INT64 value."""

bigframes/operations/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@
112112
JSONQueryArray,
113113
JSONSet,
114114
JSONValue,
115+
JSONValueArray,
115116
ParseJSON,
116117
ToJSONString,
117118
)
@@ -363,6 +364,7 @@
363364
"JSONQueryArray",
364365
"JSONSet",
365366
"JSONValue",
367+
"JSONValueArray",
366368
"ParseJSON",
367369
"ToJSONString",
368370
# Bool ops

bigframes/operations/json_ops.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,23 @@ def output_type(self, *input_types):
153153
return dtypes.STRING_DTYPE
154154

155155

156+
@dataclasses.dataclass(frozen=True)
157+
class JSONValueArray(base_ops.UnaryOp):
158+
name: typing.ClassVar[str] = "json_value_array"
159+
json_path: str
160+
161+
def output_type(self, *input_types):
162+
input_type = input_types[0]
163+
if not dtypes.is_json_like(input_type):
164+
raise TypeError(
165+
"Input type must be a valid JSON object or JSON-formatted string type."
166+
+ f" Received type: {input_type}"
167+
)
168+
return pd.ArrowDtype(
169+
pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE))
170+
)
171+
172+
156173
@dataclasses.dataclass(frozen=True)
157174
class JSONQuery(base_ops.UnaryOp):
158175
name: typing.ClassVar[str] = "json_query"

tests/system/small/bigquery/test_json.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,10 @@ def test_json_extract_array_w_invalid_series_type():
186186

187187
def test_json_extract_string_array_from_json_strings():
188188
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
189-
actual = bbq.json_extract_string_array(s, "$.a")
189+
with pytest.warns(
190+
UserWarning, match="The `json_extract_string_array` is deprecated"
191+
):
192+
actual = bbq.json_extract_string_array(s, "$.a")
190193
expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]])
191194

192195
pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
@@ -214,6 +217,53 @@ def test_json_extract_string_array_w_invalid_series_type():
214217
bbq.json_extract_string_array(s)
215218

216219

220+
def test_json_value_array_from_json_strings():
221+
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
222+
actual = bbq.json_value_array(s, "$.a")
223+
expected_data = [["ab", "2", "3 xy"], [], ["4", "5"]]
224+
# Expected dtype after JSON_VALUE_ARRAY is ARRAY<STRING>
225+
expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string())))
226+
pd.testing.assert_series_equal(
227+
actual.to_pandas(),
228+
expected.to_pandas(),
229+
)
230+
231+
232+
def test_json_value_array_from_array_strings():
233+
s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
234+
actual = bbq.json_value_array(s)
235+
expected_data = [["1", "2", "3"], [], ["4", "5"]]
236+
expected = bpd.Series(expected_data, dtype=pd.ArrowDtype(pa.list_(pa.string())))
237+
pd.testing.assert_series_equal(
238+
actual.to_pandas(),
239+
expected.to_pandas(),
240+
)
241+
242+
243+
def test_json_value_array_w_invalid_series_type():
244+
s = bpd.Series([1, 2], dtype=dtypes.INT_DTYPE) # Not a JSON-like string
245+
with pytest.raises(TypeError):
246+
bbq.json_value_array(s)
247+
248+
249+
def test_json_value_array_from_json_native():
250+
json_data = [
251+
'{"key": ["hello", "world"]}',
252+
'{"key": ["123", "45.6"]}',
253+
'{"key": []}',
254+
"{}", # case with missing key
255+
]
256+
s = bpd.Series(json_data, dtype=dtypes.JSON_DTYPE)
257+
actual = bbq.json_value_array(s, json_path="$.key")
258+
259+
expected_data_pandas = [["hello", "world"], ["123", "45.6"], [], None]
260+
expected = bpd.Series(
261+
expected_data_pandas, dtype=pd.ArrowDtype(pa.list_(pa.string()))
262+
).fillna(pd.NA)
263+
result_pd = actual.to_pandas().fillna(pd.NA)
264+
pd.testing.assert_series_equal(result_pd, expected.to_pandas())
265+
266+
217267
def test_json_query_from_json():
218268
s = bpd.Series(
219269
['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'],

0 commit comments

Comments
 (0)