Skip to content

Commit 2de89c3

Browse files
authored
chore: fix experimental blob operations json string values (#1299)
1 parent 94bc2f2 commit 2de89c3

File tree

5 files changed

+40
-18
lines changed

5 files changed

+40
-18
lines changed

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,6 +1220,11 @@ def to_json_string_op_impl(json_obj: ibis_types.Value):
12201220
return to_json_string(json_obj=json_obj)
12211221

12221222

1223+
@scalar_op_compiler.register_unary_op(ops.JSONValue, pass_op=True)
def json_value_op_impl(x: ibis_types.Value, op: ops.JSONValue):
    """Compile a ``JSONValue`` op by delegating to the ``json_value`` builtin.

    Args:
        x: Ibis expression supplying the JSON input.
        op: The ``JSONValue`` op being compiled; its ``json_path`` is forwarded.

    Returns:
        The result of calling ``json_value`` on ``x`` with ``op.json_path``.
    """
    path = op.json_path
    return json_value(json_obj=x, json_path=path)
1226+
1227+
12231228
# Blob Ops
12241229
@scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op)
12251230
def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
@@ -1931,6 +1936,13 @@ def to_json_string( # type: ignore[empty-body]
19311936
"""Convert JSON to STRING."""
19321937

19331938

1939+
@ibis_udf.scalar.builtin(name="json_value")
def json_value(  # type: ignore[empty-body]
    json_obj: ibis_dtypes.JSON,
    json_path: ibis_dtypes.String,
) -> ibis_dtypes.String:
    """Return the JSON field selected by ``json_path`` as a plain STRING."""
1944+
1945+
19341946
@ibis_udf.scalar.builtin(name="ML.DISTANCE")
19351947
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: # type: ignore[empty-body]
19361948
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""

bigframes/dataframe.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -743,16 +743,16 @@ def _repr_html_(self) -> str:
743743

744744
df = self.copy()
745745
if bigframes.options.experiments.blob:
746-
import bigframes.bigquery as bbq
747-
748746
blob_cols = [
749747
col
750748
for col in df.columns
751749
if df[col].dtype == bigframes.dtypes.OBJ_REF_DTYPE
752750
]
753751
for col in blob_cols:
754752
df[col] = df[col]._apply_unary_op(ops.ObjGetAccessUrl(mode="R"))
755-
df[col] = bbq.json_extract(df[col], "$.access_urls.read_url")
753+
df[col] = df[col]._apply_unary_op(
754+
ops.JSONValue(json_path="$.access_urls.read_url")
755+
)
756756

757757
# TODO(swast): pass max_columns and get the true column count back. Maybe
758758
# get 1 more column than we have requested so that pandas can add the
@@ -770,8 +770,7 @@ def _repr_html_(self) -> str:
770770
if bigframes.options.experiments.blob:
771771

772772
def url_to_image_html(url: str) -> str:
773-
# url is a json string, which already contains double-quotes ""
774-
return f"<img src={url}>"
773+
return f'<img src="{url}">'
775774

776775
formatters = {blob_col: url_to_image_html for blob_col in blob_cols}
777776

bigframes/operations/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
JSONExtractArray,
8787
JSONExtractStringArray,
8888
JSONSet,
89+
JSONValue,
8990
ParseJSON,
9091
ToJSONString,
9192
)
@@ -298,6 +299,7 @@
298299
"JSONExtractArray",
299300
"JSONExtractStringArray",
300301
"JSONSet",
302+
"JSONValue",
301303
"ParseJSON",
302304
"ToJSONString",
303305
# Bool ops

bigframes/operations/blob.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def metadata(self) -> bigframes.series.Series:
4747
)
4848
import bigframes.bigquery as bbq
4949

50-
return bbq.json_extract(details_json, "$.gcs_metadata")
50+
return bbq.json_extract(details_json, "$.gcs_metadata").rename("metadata")
5151

5252
def content_type(self) -> bigframes.series.Series:
5353
"""Retrieve the content type of the Blob.
@@ -57,11 +57,11 @@ def content_type(self) -> bigframes.series.Series:
5757
5858
Returns:
5959
BigFrames Series: content type of the blob, as a plain string."""
60-
import bigframes.bigquery as bbq
61-
6260
metadata = self.metadata()
6361

64-
return bbq.json_extract(metadata, "$.content_type")
62+
return metadata._apply_unary_op(
63+
ops.JSONValue(json_path="$.content_type")
64+
).rename("content_type")
6565

6666
def display(self, n: int = 3, *, content_type: str = ""):
6767
"""Display the blob content in the IPython Notebook environment. Only works for image type now.
@@ -73,14 +73,12 @@ def display(self, n: int = 3, *, content_type: str = ""):
7373
n (int, default 3): number of sample blob objects to display.
7474
content_type (str, default ""): content type of the blob. If unset, use the blob metadata of the storage. Possible values are "image", "audio" and "video".
7575
"""
76-
import bigframes.bigquery as bbq
77-
7876
# col name doesn't matter here. Rename to avoid column name conflicts
7977
df = bigframes.series.Series(self._block).rename("blob_col").head(n).to_frame()
8078

8179
obj_ref_runtime = df["blob_col"]._apply_unary_op(ops.ObjGetAccessUrl(mode="R"))
82-
df["read_url"] = bbq.json_extract(
83-
obj_ref_runtime, json_path="$.access_urls.read_url"
80+
df["read_url"] = obj_ref_runtime._apply_unary_op(
81+
ops.JSONValue(json_path="$.access_urls.read_url")
8482
)
8583

8684
if content_type:
@@ -104,11 +102,7 @@ def display_single_url(read_url: str, content_type: str):
104102
ipy_display.display(response.content, raw=True)
105103

106104
for _, row in df.iterrows():
107-
# both are JSON-formated strings
108-
read_url = str(row["read_url"]).strip('"')
109-
content_type = str(row["content_type"]).strip('"')
110-
111-
display_single_url(read_url, content_type)
105+
display_single_url(row["read_url"], row["content_type"])
112106

113107
def image_blur(
114108
self,

bigframes/operations/json_ops.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,18 @@ def output_type(self, *input_types):
120120

121121
# After JSON type implementation, ONLY return JSON data.
122122
return left_type
123+
124+
125+
@dataclasses.dataclass(frozen=True)
class JSONValue(base_ops.UnaryOp):
    """Unary op that reads the value at ``json_path`` from a JSON-like input as a string."""

    name: typing.ClassVar[str] = "json_value"
    # JSONPath expression selecting the field to read.
    json_path: str

    def output_type(self, *input_types):
        """Return the string dtype once the operand is confirmed to be JSON-like.

        Args:
            *input_types: Operand dtypes; only the first one is inspected.

        Returns:
            ``dtypes.STRING_DTYPE``.

        Raises:
            TypeError: If the first input type is not JSON-like according to
                ``dtypes.is_json_like``.
        """
        operand_type = input_types[0]
        if dtypes.is_json_like(operand_type):
            return dtypes.STRING_DTYPE
        raise TypeError(
            "Input type must be an valid JSON object or JSON-formatted string type."
            + f" Received type: {operand_type}"
        )

0 commit comments

Comments
 (0)