
Commit f672262

Authored by chelsea-lin, gcf-owl-bot[bot], and tswast
feat!: Enable reading JSON data with dbjson extension dtype (#1139)
This change updates how we handle JSON data types read from BigQuery. Previously, BigQuery JSON types were treated as generic large strings within our system. To improve accuracy and functionality, we now map them to a dedicated JSON data type (db_dtypes.JSONDtype, or db_dtypes.JSONArrowType for pyarrow). This gives JSON data a more appropriate representation, but note that the feature is still in preview and may evolve.

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Tim Sweña (Swast) <[email protected]>
Release-As: 1.34.0
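The docstrings changed below show the user-visible effect. A minimal sketch of that round trip, assuming an authenticated BigQuery DataFrames session with a billing project already configured; the call and the SQL come from the docstring examples in this diff, not new API:

import bigframes.pandas as bpd
import db_dtypes

# Reading a BigQuery JSON value now surfaces the dbjson extension dtype
# (db_dtypes.JSONDtype) instead of large_string[pyarrow]. Preview behavior;
# the mapping may change in future versions.
s = bpd.read_gbq("SELECT JSON '{\"a\": 1}' AS data")["data"]

print(s.dtype)                           # dbjson
assert s.dtype == db_dtypes.JSONDtype()  # pandas-level extension dtype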
1 parent 57746e1 commit f672262

File tree: 12 files changed, +224 / -95 lines

bigframes/bigquery/_operations/json.py

Lines changed: 2 additions & 2 deletions

@@ -53,7 +53,7 @@ def json_set(
         >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
         >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
         0    {"a":100,"b":"hi"}
-        Name: data, dtype: large_string[pyarrow]
+        Name: data, dtype: dbjson

     Args:
         input (bigframes.series.Series):
@@ -253,7 +253,7 @@ def parse_json(
         dtype: string
         >>> bbq.parse_json(s)
         0    {"class":{"students":[{"id":5},{"id":12}]}}
-        dtype: large_string[pyarrow]
+        dtype: dbjson

     Args:
         input (bigframes.series.Series):

bigframes/core/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -108,8 +108,8 @@ def from_table(
         raise ValueError("must set at most one of 'offests', 'primary_key'")
     if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names):
         msg = (
-            "Interpreting JSON column(s) as pyarrow.large_string. "
-            "This behavior may change in future versions."
+            "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is"
+            "in preview; this behavior may change in future versions."
         )
         warnings.warn(msg, bfe.PreviewWarning)
     # define data source only for needed columns, this makes row-hashing cheaper

bigframes/core/compile/ibis_types.py

Lines changed: 2 additions & 9 deletions

@@ -16,7 +16,6 @@
 import textwrap
 import typing
 from typing import Any, cast, Dict, Iterable, Optional, Tuple, Union
-import warnings

 import bigframes_vendored.constants as constants
 import bigframes_vendored.ibis
@@ -26,14 +25,14 @@
     dtype as python_type_to_ibis_type,
 )
 import bigframes_vendored.ibis.expr.types as ibis_types
+import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery as bigquery
 import numpy as np
 import pandas as pd
 import pyarrow as pa

 import bigframes.dtypes
-import bigframes.exceptions as bfe

 # Type hints for Ibis data types supported by BigQuery DataFrame
 IbisDtype = Union[
@@ -76,7 +75,7 @@
         ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True),
         gpd.array.GeometryDtype(),
     ),
-    (ibis_dtypes.json, pd.ArrowDtype(pa.large_string())),
+    (ibis_dtypes.json, db_dtypes.JSONDtype()),
 )

 BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = {
@@ -305,13 +304,7 @@ def ibis_dtype_to_bigframes_dtype(
     if isinstance(ibis_dtype, ibis_dtypes.Integer):
         return pd.Int64Dtype()

-    # Temporary: Will eventually support an explicit json type instead of casting to string.
     if isinstance(ibis_dtype, ibis_dtypes.JSON):
-        msg = (
-            "Interpreting JSON column(s) as pyarrow.large_string. This behavior may change "
-            "in future versions."
-        )
-        warnings.warn(msg, category=bfe.PreviewWarning)
         return bigframes.dtypes.JSON_DTYPE

     if ibis_dtype in IBIS_TO_BIGFRAMES:

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 19 additions & 41 deletions

@@ -1188,34 +1188,33 @@ def array_slice_op_impl(x: ibis_types.Value, op: ops.ArraySliceOp):
 # JSON Ops
 @scalar_op_compiler.register_binary_op(ops.JSONSet, pass_op=True)
 def json_set_op_impl(x: ibis_types.Value, y: ibis_types.Value, op: ops.JSONSet):
-    if x.type().is_json():
-        return json_set(
-            json_obj=x,
-            json_path=op.json_path,
-            json_value=y,
-        )
-    else:
-        # Enabling JSON type eliminates the need for less efficient string conversions.
-        return to_json_string(
-            json_set(  # type: ignore
-                json_obj=parse_json(json_str=x),
-                json_path=op.json_path,
-                json_value=y,
-            )
-        )
+    return json_set(json_obj=x, json_path=op.json_path, json_value=y)


 @scalar_op_compiler.register_unary_op(ops.JSONExtract, pass_op=True)
 def json_extract_op_impl(x: ibis_types.Value, op: ops.JSONExtract):
-    if x.type().is_json():
-        return json_extract(json_obj=x, json_path=op.json_path)
-    # json string
-    return json_extract_string(json_obj=x, json_path=op.json_path)
+    # Define a user-defined function whose returned type is dynamically matching the input.
+    def json_extract(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
+        """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
+        ...
+
+    return_type = x.type()
+    json_extract.__annotations__["return"] = return_type
+    json_extract_op = ibis_udf.scalar.builtin(json_extract)
+    return json_extract_op(json_or_json_string=x, json_path=op.json_path)


 @scalar_op_compiler.register_unary_op(ops.JSONExtractArray, pass_op=True)
 def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
-    return json_extract_array(json_obj=x, json_path=op.json_path)
+    # Define a user-defined function whose returned type is dynamically matching the input.
+    def json_extract_array(json_or_json_string, json_path: ibis_dtypes.str):  # type: ignore
+        """Extracts a JSON value and converts it to a SQL JSON-formatted STRING or JSON value."""
+        ...
+
+    return_type = x.type()
+    json_extract_array.__annotations__["return"] = ibis_dtypes.Array[return_type]  # type: ignore
+    json_extract_op = ibis_udf.scalar.builtin(json_extract_array)
+    return json_extract_op(json_or_json_string=x, json_path=op.json_path)


 @scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True)
@@ -1937,27 +1936,6 @@ def json_set(  # type: ignore[empty-body]
     """Produces a new SQL JSON value with the specified JSON data inserted or replaced."""


-@ibis_udf.scalar.builtin(name="json_extract")
-def json_extract(  # type: ignore[empty-body]
-    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
-) -> ibis_dtypes.JSON:
-    """Extracts a JSON value and converts it to a JSON value."""
-
-
-@ibis_udf.scalar.builtin(name="json_extract")
-def json_extract_string(  # type: ignore[empty-body]
-    json_obj: ibis_dtypes.String, json_path: ibis_dtypes.String
-) -> ibis_dtypes.String:
-    """Extracts a JSON SRING value and converts it to a SQL JSON-formatted STRING."""
-
-
-@ibis_udf.scalar.builtin(name="json_extract_array")
-def json_extract_array(  # type: ignore[empty-body]
-    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
-) -> ibis_dtypes.Array[ibis_dtypes.String]:
-    """Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""
-
-
 @ibis_udf.scalar.builtin(name="json_extract_string_array")
 def json_extract_string_array(  # type: ignore[empty-body]
     json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
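The rewritten json_extract_op_impl and json_extract_array_op_impl above lean on one pattern: an empty-bodied builtin scalar UDF whose return annotation is patched at compile time, so JSON inputs yield JSON outputs and STRING inputs yield JSON-formatted STRING outputs. A standalone sketch of that pattern, written against upstream ibis names (ibis.udf.scalar.builtin, plain Python type annotations) rather than the vendored modules this file imports, so treat the exact import paths as assumptions:

import ibis.expr.datatypes as dt
from ibis import udf


def builtin_json_extract(return_type: dt.DataType):
    # Empty function body: udf.scalar.builtin maps the signature onto the
    # backend's built-in JSON_EXTRACT function rather than compiling Python.
    def json_extract(json_or_json_string, json_path: str):  # type: ignore
        """Extract a JSON value as JSON or as a JSON-formatted STRING."""
        ...

    # Patch the return annotation so the output type mirrors the input type.
    json_extract.__annotations__["return"] = return_type
    return udf.scalar.builtin(json_extract)


# Usage inside a compiler op, given an ibis column expression `x`:
#   builtin_json_extract(x.type())(json_or_json_string=x, json_path="$.a")
# returns JSON when x is JSON and STRING when x is a JSON-formatted string.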

bigframes/dtypes.py

Lines changed: 3 additions & 3 deletions

@@ -21,6 +21,7 @@
 from typing import Any, Dict, List, Literal, Union

 import bigframes_vendored.constants as constants
+import db_dtypes  # type: ignore
 import geopandas as gpd  # type: ignore
 import google.cloud.bigquery
 import numpy as np
@@ -59,7 +60,7 @@
 # No arrow equivalent
 GEO_DTYPE = gpd.array.GeometryDtype()
 # JSON
-JSON_DTYPE = pd.ArrowDtype(pa.large_string())
+JSON_DTYPE = db_dtypes.JSONDtype()
 OBJ_REF_DTYPE = pd.ArrowDtype(
     pa.struct(
         (
@@ -161,7 +162,7 @@ class SimpleDtypeInfo:
     ),
     SimpleDtypeInfo(
         dtype=JSON_DTYPE,
-        arrow_dtype=pa.large_string(),
+        arrow_dtype=db_dtypes.JSONArrowType(),
         type_kind=("JSON",),
         orderable=False,
         clusterable=False,
@@ -320,7 +321,6 @@ def is_struct_like(type_: ExpressionType) -> bool:


 def is_json_like(type_: ExpressionType) -> bool:
-    # TODO: Add JSON type support
     return type_ == JSON_DTYPE or type_ == STRING_DTYPE  # Including JSON string

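For orientation on the two db-dtypes objects wired in above: JSONDtype is the pandas-level extension dtype (it is what prints as dbjson in a Series repr), while JSONArrowType is the pyarrow-level extension type used as its Arrow representation. A small sketch, assuming db-dtypes >= 1.4.0 as pinned later in this commit:

import db_dtypes
import pyarrow as pa

pandas_dtype = db_dtypes.JSONDtype()    # pandas extension dtype, name "dbjson"
arrow_type = db_dtypes.JSONArrowType()  # pyarrow extension type for JSON columns

print(pandas_dtype.name)                         # expected: dbjson
print(isinstance(arrow_type, pa.ExtensionType))  # expected: True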

bigframes/operations/json_ops.py

Lines changed: 2 additions & 3 deletions

@@ -50,7 +50,7 @@ def output_type(self, *input_types):
             + f" Received type: {input_type}"
         )
         return pd.ArrowDtype(
-            pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE))
+            pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(input_type))
         )

@@ -118,8 +118,7 @@ def output_type(self, *input_types):
             + f"Received type: {right_type}"
         )

-        # After JSON type implementation, ONLY return JSON data.
-        return left_type
+        return dtypes.JSON_DTYPE


 @dataclasses.dataclass(frozen=True)

bigframes/session/_io/pandas.py

Lines changed: 3 additions & 0 deletions

@@ -17,6 +17,7 @@
 from typing import Collection, Union

 import bigframes_vendored.constants as constants
+import db_dtypes  # type: ignore
 import geopandas  # type: ignore
 import numpy as np
 import pandas
@@ -122,6 +123,8 @@ def arrow_to_pandas(
         )
     elif isinstance(dtype, pandas.ArrowDtype):
         series = _arrow_to_pandas_arrowdtype(column, dtype)
+    elif isinstance(dtype, db_dtypes.JSONDtype):
+        series = db_dtypes.JSONArray(column)
     else:
         series = column.to_pandas(types_mapper=lambda _: dtype)
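The new branch above bypasses column.to_pandas() for JSON columns and wraps the Arrow data directly in db_dtypes.JSONArray. A minimal sketch of that branch in isolation; the column literal is hypothetical (in arrow_to_pandas it comes from the result pyarrow.Table), and the exact Arrow storage JSONArray accepts depends on the db-dtypes version:

import db_dtypes
import pyarrow as pa

# Hypothetical Arrow column holding JSON payloads.
column = pa.chunked_array([pa.array(['{"a": 1}', None, '{"b": 2}'])])
dtype = db_dtypes.JSONDtype()

if isinstance(dtype, db_dtypes.JSONDtype):
    # Wrap the Arrow data in the JSON extension array instead of calling
    # column.to_pandas() with a types_mapper.
    series = db_dtypes.JSONArray(column)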

setup.py

Lines changed: 1 addition & 0 deletions

@@ -62,6 +62,7 @@
     "ipywidgets >=7.7.1",
     "humanize >=4.6.0",
     "matplotlib >=3.7.1",
+    "db-dtypes >=1.4.0",
     # For vendored ibis-framework.
     "atpublic>=2.3,<6",
     "parsy>=2,<3",

testing/constraints-3.9.txt

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ tabulate==0.9
 ipywidgets==7.7.1
 humanize==4.6.0
 matplotlib==3.7.1
+db-dtypes==1.4.0
 # For vendored ibis-framework.
 atpublic==2.3
 parsy==2.0

tests/system/small/bigquery/test_json.py

Lines changed: 46 additions & 18 deletions

@@ -118,7 +118,6 @@ def test_json_set_w_invalid_series_type():
 def test_json_extract_from_json():
     s = _get_series_from_json([{"a": {"b": [1, 2]}}, {"a": {"c": 1}}, {"a": {"b": 0}}])
     actual = bbq.json_extract(s, "$.a.b").to_pandas()
-    # After the introduction of the JSON type, the output should be a JSON-formatted series.
     expected = _get_series_from_json([[1, 2], None, 0]).to_pandas()
     pd.testing.assert_series_equal(
         actual,
@@ -129,12 +128,10 @@ def test_json_extract_from_json():
 def test_json_extract_from_string():
     s = bpd.Series(['{"a": {"b": [1, 2]}}', '{"a": {"c": 1}}', '{"a": {"b": 0}}'])
     actual = bbq.json_extract(s, "$.a.b")
-    expected = _get_series_from_json([[1, 2], None, 0])
+    expected = bpd.Series(["[1,2]", None, "0"])
     pd.testing.assert_series_equal(
         actual.to_pandas(),
         expected.to_pandas(),
-        check_names=False,
-        check_dtype=False,  # json_extract returns string type. While _get_series_from_json gives a JSON series (pa.large_string).
     )

@@ -143,29 +140,68 @@ def test_json_extract_w_invalid_series_type():
         bbq.json_extract(bpd.Series([1, 2]), "$.a")


+def test_json_extract_array_from_json():
+    s = _get_series_from_json(
+        [{"a": ["ab", "2", "3 xy"]}, {"a": []}, {"a": ["4", "5"]}, {}]
+    )
+    actual = bbq.json_extract_array(s, "$.a")
+
+    # This code provides a workaround for issue https://github.com/apache/arrow/issues/45262,
+    # which currently prevents constructing a series using the pa.list_(db_types.JSONArrrowType())
+    sql = """
+        SELECT 0 AS id, [JSON '"ab"', JSON '"2"', JSON '"3 xy"'] AS data,
+        UNION ALL
+        SELECT 1, [],
+        UNION ALL
+        SELECT 2, [JSON '"4"', JSON '"5"'],
+        UNION ALL
+        SELECT 3, null,
+    """
+    df = bpd.read_gbq(sql).set_index("id").sort_index()
+    expected = df["data"]
+
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
 def test_json_extract_array_from_json_strings():
-    s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
+    s = bpd.Series(
+        ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}', "{}"],
+        dtype=pd.StringDtype(storage="pyarrow"),
+    )
     actual = bbq.json_extract_array(s, "$.a")
-    expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
+    expected = bpd.Series(
+        [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None],
+        dtype=pd.StringDtype(storage="pyarrow"),
+    )
     pd.testing.assert_series_equal(
         actual.to_pandas(),
         expected.to_pandas(),
     )


-def test_json_extract_array_from_array_strings():
-    s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
+def test_json_extract_array_from_json_array_strings():
+    s = bpd.Series(
+        ["[1, 2, 3]", "[]", "[4,5]"],
+        dtype=pd.StringDtype(storage="pyarrow"),
+    )
     actual = bbq.json_extract_array(s)
-    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
+    expected = bpd.Series(
+        [["1", "2", "3"], [], ["4", "5"]],
+        dtype=pd.StringDtype(storage="pyarrow"),
+    )
     pd.testing.assert_series_equal(
         actual.to_pandas(),
         expected.to_pandas(),
     )


 def test_json_extract_array_w_invalid_series_type():
+    s = bpd.Series([1, 2])
     with pytest.raises(TypeError):
-        bbq.json_extract_array(bpd.Series([1, 2]))
+        bbq.json_extract_array(s)


 def test_json_extract_string_array_from_json_strings():
@@ -203,14 +239,6 @@ def test_json_extract_string_array_w_invalid_series_type():
         bbq.json_extract_string_array(bpd.Series([1, 2]))


-# b/381148539
-def test_json_in_struct():
-    df = bpd.read_gbq(
-        "SELECT STRUCT(JSON '{\\\"a\\\": 1}' AS data, 1 AS number) as struct_col"
-    )
-    assert df["struct_col"].struct.field("data")[0] == '{"a":1}'
-
-
 def test_parse_json_w_invalid_series_type():
     with pytest.raises(TypeError):
         bbq.parse_json(bpd.Series([1, 2]))
