Skip to content

Commit f1d9914

Browse files
authored
SNOW-1875145: Support querying json element in col (#2875)
<!--- Please answer these questions before creating your pull request. Thanks! ---> 1. Which Jira issue is this PR addressing? Make sure that there is an accompanying issue to your PR. <!--- In this section, please add a Snowflake Jira issue number. Note that if a corresponding GitHub issue exists, you should still include the Snowflake Jira issue number. For example, for GitHub issue #1400, you should add "SNOW-1335071" here. ---> Fixes SNOW-1875145 2. Fill out the following pre-review checklist: - [x] I am adding a new automated test(s) to verify correctness of my new code - [ ] If this test skips Local Testing mode, I'm requesting review from @snowflakedb/local-testing - [ ] I am adding new logging messages - [ ] I am adding a new telemetry message - [ ] I am adding new credentials - [ ] I am adding a new dependency - [ ] If this is a new feature/behavior, I'm adding the Local Testing parity changes. - [x] I acknowledge that I have ensured my changes to be thread-safe. Follow the link for more information: [Thread-safe Developer Guidelines](https://github.com/snowflakedb/snowpark-python/blob/main/CONTRIBUTING.md#thread-safe-development) 3. Please describe how your code solves the related issue. Please write a short description of how your code change solves the related issue.
1 parent f801bd0 commit f1d9914

File tree

4 files changed

+121
-15
lines changed

4 files changed

+121
-15
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
#### Experimental Features
4848

4949
- Added `Catalog` class to manage snowflake objects. It can be accessed via `Session.catalog`.
50+
- Added support for querying json element of a VARIANT column in `functions.col` and `functions.column` with an optional keyword argument `json_element`.
5051
- Allow user input schema when reading JSON file on stage.
5152
- Added support for specifying a schema string (including implicit struct syntax) when calling `DataFrame.create_dataframe`.
5253
- `snowflake.core` is a dependency required for this feature.

src/snowflake/snowpark/column.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,11 +252,27 @@ def __init__(
252252
self,
253253
expr1: Union[str, Expression],
254254
expr2: Optional[str] = None,
255+
json_element: bool = False,
255256
_ast: Optional[proto.Expr] = None,
256257
_emit_ast: bool = True,
257258
) -> None:
258259
self._ast = _ast
259260

261+
def derive_json_element_expr(
262+
expr: str, df_alias: Optional[str] = None
263+
) -> UnresolvedAttribute:
264+
parts = expr.split(".")
265+
if len(parts) == 1:
266+
return UnresolvedAttribute(quote_name(parts[0]), df_alias=df_alias)
267+
else:
268+
# According to https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation,
269+
# the json value on the path should be case-sensitive
270+
return UnresolvedAttribute(
271+
f"{quote_name(parts[0])}:{'.'.join(quote_name(part, keep_case=True) for part in parts[1:])}",
272+
is_sql_text=True,
273+
df_alias=df_alias,
274+
)
275+
260276
if expr2 is not None:
261277
if not (isinstance(expr1, str) and isinstance(expr2, str)):
262278
raise ValueError(
@@ -265,6 +281,8 @@ def __init__(
265281

266282
if expr2 == "*":
267283
self._expression = Star([], df_alias=expr1)
284+
elif json_element:
285+
self._expression = derive_json_element_expr(expr2, expr1)
268286
else:
269287
self._expression = UnresolvedAttribute(
270288
quote_name(expr2), df_alias=expr1
@@ -279,6 +297,8 @@ def __init__(
279297
elif isinstance(expr1, str):
280298
if expr1 == "*":
281299
self._expression = Star([])
300+
elif json_element:
301+
self._expression = derive_json_element_expr(expr1)
282302
else:
283303
self._expression = UnresolvedAttribute(quote_name(expr1))
284304

@@ -1483,9 +1503,15 @@ class CaseExpr(Column):
14831503
"""
14841504

14851505
def __init__(
1486-
self, expr: CaseWhen, _ast: Optional[proto.Expr] = None, _emit_ast: bool = True
1506+
self,
1507+
expr: CaseWhen,
1508+
json_element: bool = False,
1509+
_ast: Optional[proto.Expr] = None,
1510+
_emit_ast: bool = True,
14871511
) -> None:
1488-
super().__init__(expr, _ast=_ast, _emit_ast=_emit_ast)
1512+
super().__init__(
1513+
expr, json_element=json_element, _ast=_ast, _emit_ast=_emit_ast
1514+
)
14891515
self._branches = expr.branches
14901516

14911517
@publicapi

src/snowflake/snowpark/functions.py

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -262,20 +262,31 @@ def _check_column_parameters(name1: str, name2: Optional[str]) -> None:
262262

263263
@overload
264264
@publicapi
265-
def col(col_name: str, _emit_ast: bool = True) -> Column:
265+
def col(col_name: str, json_element: bool = False, _emit_ast: bool = True) -> Column:
266266
"""Returns the :class:`~snowflake.snowpark.Column` with the specified name.
267267
268+
Args:
269+
col_name: The name of the column.
270+
json_element: Whether the column is a JSON element. If a column is a VARIANT column in Snowflake,
271+
you can dot notation `.` to query the nested json element, e.g., "name.firstname" and "name.lastname".
272+
268273
Example::
269274
>>> df = session.sql("select 1 as a")
270275
>>> df.select(col("a")).collect()
271276
[Row(A=1)]
277+
278+
>>> df = session.sql("select parse_json('{\"firstname\": \"John\", \"lastname\": \"Doe\"}') as name")
279+
>>> df.select(col("name.firstname", json_element=True)).collect()
280+
[Row(FIRSTNAME='John')]
272281
"""
273282
... # pragma: no cover
274283

275284

276285
@overload
277286
@publicapi
278-
def col(df_alias: str, col_name: str, _emit_ast: bool = True) -> Column:
287+
def col(
288+
df_alias: str, col_name: str, json_element: bool = False, _emit_ast: bool = True
289+
) -> Column:
279290
"""Returns the :class:`~snowflake.snowpark.Column` with the specified dataframe alias and column name.
280291
281292
Example::
@@ -287,7 +298,12 @@ def col(df_alias: str, col_name: str, _emit_ast: bool = True) -> Column:
287298

288299

289300
@publicapi
290-
def col(name1: str, name2: Optional[str] = None, _emit_ast: bool = True) -> Column:
301+
def col(
302+
name1: str,
303+
name2: Optional[str] = None,
304+
json_element: bool = False,
305+
_emit_ast: bool = True,
306+
) -> Column:
291307

292308
_check_column_parameters(name1, name2)
293309

@@ -296,27 +312,38 @@ def col(name1: str, name2: Optional[str] = None, _emit_ast: bool = True) -> Colu
296312
ast = create_ast_for_column(name1, name2, "col")
297313

298314
if name2 is None:
299-
return Column(name1, _ast=ast)
315+
return Column(name1, json_element=json_element, _ast=ast)
300316
else:
301-
return Column(name1, name2, _ast=ast)
317+
return Column(name1, name2, json_element=json_element, _ast=ast)
302318

303319

304320
@overload
305321
@publicapi
306-
def column(col_name: str, _emit_ast: bool = True) -> Column:
322+
def column(col_name: str, json_element: bool = False, _emit_ast: bool = True) -> Column:
307323
"""Returns a :class:`~snowflake.snowpark.Column` with the specified name. Alias for col.
308324
325+
Args:
326+
col_name: The name of the column.
327+
json_element: Whether the column is a JSON element. If a column is a VARIANT column in Snowflake,
328+
you can dot notation `.` to query the nested json element, e.g., "name.firstname" and "name.lastname".
329+
309330
Example::
310-
>>> df = session.sql("select 1 as a")
311-
>>> df.select(column("a")).collect()
312-
[Row(A=1)]
331+
>>> df = session.sql("select 1 as a")
332+
>>> df.select(column("a")).collect()
333+
[Row(A=1)]
334+
335+
>>> df = session.sql("select parse_json('{\"firstname\": \"John\", \"lastname\": \"Doe\"}') as name")
336+
>>> df.select(column("name.firstname", json_element=True)).collect()
337+
[Row(FIRSTNAME='John')]
313338
"""
314339
... # pragma: no cover
315340

316341

317342
@overload
318343
@publicapi
319-
def column(df_alias: str, col_name: str, _emit_ast: bool = True) -> Column:
344+
def column(
345+
df_alias: str, col_name: str, json_element: bool = False, _emit_ast: bool = True
346+
) -> Column:
320347
"""Returns a :class:`~snowflake.snowpark.Column` with the specified name and dataframe alias name. Alias for col.
321348
322349
Example::
@@ -328,15 +355,20 @@ def column(df_alias: str, col_name: str, _emit_ast: bool = True) -> Column:
328355

329356

330357
@publicapi
331-
def column(name1: str, name2: Optional[str] = None, _emit_ast: bool = True) -> Column:
358+
def column(
359+
name1: str,
360+
name2: Optional[str] = None,
361+
json_element: bool = False,
362+
_emit_ast: bool = True,
363+
) -> Column:
332364
_check_column_parameters(name1, name2)
333365

334366
ast = create_ast_for_column(name1, name2, "column") if _emit_ast else None
335367

336368
if name2 is None:
337-
return Column(name1, _ast=ast)
369+
return Column(name1, json_element=json_element, _ast=ast)
338370
else:
339-
return Column(name1, name2, _ast=ast)
371+
return Column(name1, name2, json_element=json_element, _ast=ast)
340372

341373

342374
@publicapi

tests/integ/test_function.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,53 @@
185185
)
186186

187187

188+
@pytest.mark.skipif(
189+
"config.getoption('local_testing_mode', default=False)",
190+
reason="querying json element is not supported in local testing",
191+
)
192+
def test_col_json_element(session):
193+
# 2-level deep
194+
df = session.sql(
195+
'select parse_json(\'{"firstname": "John", "lastname": "Doe"}\') as name'
196+
)
197+
Utils.check_answer(
198+
df.select(
199+
col("name.firstname", json_element=True),
200+
col("name.lastname", json_element=True),
201+
),
202+
[Row('"John"', '"Doe"')],
203+
)
204+
Utils.check_answer(
205+
df.select(
206+
col('name."firstname"', json_element=True),
207+
col('NAME."lastname"', json_element=True),
208+
),
209+
[Row('"John"', '"Doe"')],
210+
)
211+
Utils.check_answer(df.select(col("name.FIRSTNAME", json_element=True)), [Row(None)])
212+
213+
# 3-level deep
214+
with pytest.raises(SnowparkSQLException, match="invalid identifier"):
215+
df.select(col("name:firstname", json_element=True)).collect()
216+
217+
with pytest.raises(SnowparkSQLException, match="invalid identifier"):
218+
df.select(col("name.firstname")).collect()
219+
220+
df = session.sql('select parse_json(\'{"l1": {"l2": "xyz"}}\') as value')
221+
Utils.check_answer(df.select(col("value.l1.l2", json_element=True)), Row('"xyz"'))
222+
Utils.check_answer(
223+
df.select(col('value."l1"."l2"', json_element=True)), Row('"xyz"')
224+
)
225+
Utils.check_answer(df.select(col("value.L1.l2", json_element=True)), Row(None))
226+
Utils.check_answer(df.select(col("value.l1.L2", json_element=True)), Row(None))
227+
228+
with pytest.raises(SnowparkSQLException, match="invalid identifier"):
229+
df.select(col("value:l1.l2", json_element=True)).collect()
230+
231+
with pytest.raises(SnowparkSQLException, match="invalid identifier"):
232+
df.select(col("value.l1.l2")).collect()
233+
234+
188235
def test_order(session):
189236
null_data1 = TestData.null_data1(session)
190237
assert null_data1.sort(asc(null_data1["A"])).collect() == [

0 commit comments

Comments
 (0)