21
21
22
22
from __future__ import annotations
23
23
24
- from typing import Any , Sequence , Tuple
24
+ from typing import Any , cast , Optional , Sequence , Tuple , Union
25
25
26
+ import bigframes .dtypes
26
27
import bigframes .operations as ops
27
28
import bigframes .series as series
28
29
30
+ from . import array
31
+
29
32
30
33
def json_set (
31
- series : series .Series ,
34
+ input : series .Series ,
32
35
json_path_value_pairs : Sequence [Tuple [str , Any ]],
33
36
) -> series .Series :
34
37
"""Produces a new JSON value within a Series by inserting or replacing values at
@@ -47,7 +50,7 @@ def json_set(
47
50
Name: data, dtype: string
48
51
49
52
Args:
50
- series (bigframes.series.Series):
53
+ input (bigframes.series.Series):
51
54
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
52
55
json_path_value_pairs (Sequence[Tuple[str, Any]]):
53
56
Pairs of JSON path and the new value to insert/replace.
@@ -59,6 +62,7 @@ def json_set(
59
62
# SQLGlot parser does not support the "create_if_missing => true" syntax, so
60
63
# create_if_missing is not currently implemented.
61
64
65
+ result = input
62
66
for json_path_value_pair in json_path_value_pairs :
63
67
if len (json_path_value_pair ) != 2 :
64
68
raise ValueError (
@@ -67,14 +71,14 @@ def json_set(
67
71
)
68
72
69
73
json_path , json_value = json_path_value_pair
70
- series = series ._apply_binary_op (
74
+ result = result ._apply_binary_op (
71
75
json_value , ops .JSONSet (json_path = json_path ), alignment = "left"
72
76
)
73
- return series
77
+ return result
74
78
75
79
76
80
def json_extract (
77
- series : series .Series ,
81
+ input : series .Series ,
78
82
json_path : str ,
79
83
) -> series .Series :
80
84
"""Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
@@ -93,24 +97,24 @@ def json_extract(
93
97
dtype: string
94
98
95
99
Args:
96
- series (bigframes.series.Series):
100
+ input (bigframes.series.Series):
97
101
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
98
102
json_path (str):
99
103
The JSON path identifying the data that you want to obtain from the input.
100
104
101
105
Returns:
102
106
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
103
107
"""
104
- return series ._apply_unary_op (ops .JSONExtract (json_path = json_path ))
108
+ return input ._apply_unary_op (ops .JSONExtract (json_path = json_path ))
105
109
106
110
107
111
def json_extract_array (
108
- series : series .Series ,
112
+ input : series .Series ,
109
113
json_path : str = "$" ,
110
114
) -> series .Series :
111
- """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON`
112
- values. This function uses single quotes and brackets to escape invalid JSONPath
113
- characters in JSON keys.
115
+ """Extracts a JSON array and converts it to a SQL array of JSON-formatted
116
+ `STRING` or `JSON` values. This function uses single quotes and brackets to
117
+ escape invalid JSONPath characters in JSON keys.
114
118
115
119
**Examples:**
116
120
@@ -124,13 +128,98 @@ def json_extract_array(
124
128
1 ['4' '5']
125
129
dtype: list<item: string>[pyarrow]
126
130
131
+ >>> s = bpd.Series([
132
+ ... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
133
+ ... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}'
134
+ ... ])
135
+ >>> bbq.json_extract_array(s, "$.fruits")
136
+ 0 ['{"name":"apple"}' '{"name":"cherry"}']
137
+ 1 ['{"name":"guava"}' '{"name":"grapes"}']
138
+ dtype: list<item: string>[pyarrow]
139
+
140
+ >>> s = bpd.Series([
141
+ ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
142
+ ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
143
+ ... ])
144
+ >>> bbq.json_extract_array(s, "$.fruits.names")
145
+ 0 ['"apple"' '"cherry"']
146
+ 1 ['"guava"' '"grapes"']
147
+ dtype: list<item: string>[pyarrow]
148
+
127
149
Args:
128
- series (bigframes.series.Series):
150
+ input (bigframes.series.Series):
129
151
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
130
152
json_path (str):
131
153
The JSON path identifying the data that you want to obtain from the input.
132
154
133
155
Returns:
134
- bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING .
156
+ bigframes.series.Series: A new Series with the parsed arrays from the input .
135
157
"""
136
- return series ._apply_unary_op (ops .JSONExtractArray (json_path = json_path ))
158
+ return input ._apply_unary_op (ops .JSONExtractArray (json_path = json_path ))
159
+
160
+
161
def json_extract_string_array(
    input: series.Series,
    json_path: str = "$",
    value_dtype: Optional[
        Union[bigframes.dtypes.Dtype, bigframes.dtypes.DtypeString]
    ] = None,
) -> series.Series:
    """Extracts a JSON array from each row as a SQL array of `STRING` values.

    When `value_dtype` is given (and is not the string dtype), every element of
    the extracted arrays is additionally converted to that dtype. Single quotes
    and brackets are used to escape invalid JSONPath characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_extract_string_array(s)
        0    ['1' '2' '3']
        1        ['4' '5']
        dtype: list<item: string>[pyarrow]

        >>> bbq.json_extract_string_array(s, value_dtype='Int64')
        0    [1 2 3]
        1      [4 5]
        dtype: list<item: int64>[pyarrow]

        >>> s = bpd.Series([
        ...   '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
        ...   '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
        ... ])
        >>> bbq.json_extract_string_array(s, "$.fruits.names")
        0    ['apple' 'cherry']
        1    ['guava' 'grapes']
        dtype: list<item: string>[pyarrow]

    Args:
        input (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path locating the array to extract from each input value.
            Defaults to ``"$"``.
        value_dtype (dtype, Optional):
            Target dtype for the array elements. ``None`` (the default) or the
            string dtype leaves the elements as strings.

    Returns:
        bigframes.series.Series: A new Series with the parsed arrays from the input.
    """
    string_arrays = input._apply_unary_op(
        ops.JSONExtractStringArray(json_path=json_path)
    )

    # No conversion requested: the extracted string arrays are the result.
    if value_dtype in (None, bigframes.dtypes.STRING_DTYPE):
        return string_arrays

    # Explode the arrays so each element can be converted on its own.
    elements = string_arrays.explode()
    if value_dtype == bigframes.dtypes.BOOL_DTYPE:
        # Booleans come from a case-insensitive match on "true", not astype().
        elements = elements.str.lower() == "true"
    else:
        elements = elements.astype(value_dtype)

    # Re-aggregate the converted elements into arrays, grouped by the index
    # levels of the original input.
    regrouped = elements.groupby(level=input.index.names, dropna=False)
    return cast(series.Series, array.array_agg(regrouped))
0 commit comments