Skip to content

Commit 73afd0f

Browse files
fix(bigquery): Make JSONPathTokenizer more lenient for new standards (#4447)
* fix(bigquery): Make JSONPathTokenizer more lenient for new standards
* Mutate attr instead of Generator flags
* PR Feedback 1
* Switch to non-Tokenizer solution
* Add comment to parse_var_text

---------

Co-authored-by: George Sittas <[email protected]>
1 parent 954d8fd commit 73afd0f

File tree

4 files changed

+85
-14
lines changed

4 files changed

+85
-14
lines changed

sqlglot/dialects/bigquery.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@
4040
logger = logging.getLogger("sqlglot")
4141

4242

43+
# Expression node types accepted by `_json_extract_sql`, the generator function that
# BigQuery registers for JSONExtract, JSONExtractScalar and JSONExtractArray.
JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]
44+
45+
# Upper-cased function names for which `_json_extract_sql` switches off the generator's
# `_quote_json_path_key_using_brackets` flag while rendering the call. Per the dialect tests,
# keys that need quoting then come out double-quoted, e.g. '$." a b c ".d'.
DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")
46+
47+
4348
def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
4449
if not expression.find_ancestor(exp.From, exp.Join):
4550
return self.values_sql(expression)
@@ -324,6 +329,23 @@ def _build_contains_substring(args: t.List) -> exp.Contains | exp.Anonymous:
324329
return exp.Contains(this=this, expression=expr)
325330

326331

332+
def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    """
    Render a JSON extraction expression as a BigQuery function call.

    The function name is taken from the expression's "name" meta entry when one is present,
    otherwise from `expression.sql_name()`, and is always emitted upper-cased.

    While the call is being rendered, the generator's `_quote_json_path_key_using_brackets`
    flag is set according to this function's own name: it is turned off for the functions
    listed in DQUOTES_ESCAPING_JSON_FUNCTIONS and turned on for every other name.

    Args:
        self: The BigQuery generator rendering the expression.
        expression: The JSON extraction node to render.

    Returns:
        The SQL text of the function call.
    """
    # `_meta` is checked first so that a meta dict is not touched when none was ever set.
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    # Remember the flag of any enclosing JSON extraction that is still being rendered. Blindly
    # resetting it to True afterwards would make a nested call clobber the quoting mode that the
    # outer call needs for its own path argument, which is generated after the nested call.
    previous = self._quote_json_path_key_using_brackets
    self._quote_json_path_key_using_brackets = upper not in DQUOTES_ESCAPING_JSON_FUNCTIONS

    try:
        return rename_func(upper)(self, expression)
    finally:
        # Restore even when generation raises, so a failure cannot leave the generator
        # stuck in the wrong quoting mode for subsequent expressions.
        self._quote_json_path_key_using_brackets = previous
347+
348+
327349
class BigQuery(Dialect):
328350
WEEK_OFFSET = -1
329351
UNNEST_COLUMN_ONLY = True
@@ -869,6 +891,9 @@ class Generator(generator.Generator):
869891
exp.ILike: no_ilike_sql,
870892
exp.IntDiv: rename_func("DIV"),
871893
exp.Int64: rename_func("INT64"),
894+
exp.JSONExtract: _json_extract_sql,
895+
exp.JSONExtractArray: _json_extract_sql,
896+
exp.JSONExtractScalar: _json_extract_sql,
872897
exp.JSONFormat: rename_func("TO_JSON_STRING"),
873898
exp.Levenshtein: _levenshtein_sql,
874899
exp.Max: max_or_greatest,

sqlglot/generator.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -658,6 +658,7 @@ class Generator(metaclass=_Generator):
658658
"_next_name",
659659
"_identifier_start",
660660
"_identifier_end",
661+
"_quote_json_path_key_using_brackets",
661662
)
662663

663664
def __init__(
@@ -706,6 +707,8 @@ def __init__(
706707
self._identifier_start = self.dialect.IDENTIFIER_START
707708
self._identifier_end = self.dialect.IDENTIFIER_END
708709

710+
self._quote_json_path_key_using_brackets = True
711+
709712
def generate(self, expression: exp.Expression, copy: bool = True) -> str:
710713
"""
711714
Generates the SQL string corresponding to the given syntax tree.
@@ -2871,7 +2874,7 @@ def json_path_part(self, expression: int | str | exp.JSONPathPart) -> str:
28712874
if isinstance(expression, int):
28722875
return str(expression)
28732876

2874-
if self.JSON_PATH_SINGLE_QUOTE_ESCAPE:
2877+
if self._quote_json_path_key_using_brackets and self.JSON_PATH_SINGLE_QUOTE_ESCAPE:
28752878
escaped = expression.replace("'", "\\'")
28762879
escaped = f"\\'{expression}\\'"
28772880
else:
@@ -4072,7 +4075,11 @@ def _jsonpathkey_sql(self, expression: exp.JSONPathKey) -> str:
40724075
return f".{this}"
40734076

40744077
this = self.json_path_part(this)
4075-
return f"[{this}]" if self.JSON_PATH_BRACKETED_KEY_SUPPORTED else f".{this}"
4078+
return (
4079+
f"[{this}]"
4080+
if self._quote_json_path_key_using_brackets and self.JSON_PATH_BRACKETED_KEY_SUPPORTED
4081+
else f".{this}"
4082+
)
40764083

40774084
def _jsonpathsubscript_sql(self, expression: exp.JSONPathSubscript) -> str:
40784085
this = self.json_path_part(expression.this)

sqlglot/jsonpath.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,28 @@ def _parse_bracket() -> exp.JSONPathPart:
146146

147147
return node
148148

149+
def _parse_var_text() -> str:
    """
    Return the raw path text of a key whose first VAR token has just been matched.

    BigQuery accepts keys that contain spaces, e.g. JSON_QUERY(..., '$. a b c ') should produce
    a single JSONPathKey(' a b c '). To support that, every directly following VAR token is
    consumed as well, and the key text is sliced out of the original path string: it begins
    right after the token that precedes the first VAR and stops at the next remaining token
    (dot, colon etc), or at the end of the path when no tokens are left.
    """
    # The caller has already matched the first VAR, so the token preceding the key sits two
    # positions behind the cursor. This must be captured before more tokens are consumed.
    before_key = i - 2

    # Swallow the rest of the consecutive VAR tokens; only the cursor movement matters here.
    while _match(TokenType.VAR):
        continue

    # A negative index means the key is the very first token, so it starts with the path.
    begin = tokens[before_key].end + 1 if before_key >= 0 else 0

    # With no tokens left the key is the last one, so its text is the remainder of the path.
    stop = tokens[i].start if i < len(tokens) else None

    return path[begin:stop]
170+
149171
# We canonicalize the JSON path AST so that it always starts with a
150172
# "root" element, so paths like "field" will be generated as "$.field"
151173
_match(TokenType.DOLLAR)
@@ -155,8 +177,10 @@ def _parse_bracket() -> exp.JSONPathPart:
155177
if _match(TokenType.DOT) or _match(TokenType.COLON):
156178
recursive = _prev().text == ".."
157179

158-
if _match(TokenType.VAR) or _match(TokenType.IDENTIFIER):
159-
value: t.Optional[str | exp.JSONPathWildcard] = _prev().text
180+
if _match(TokenType.VAR):
181+
value: t.Optional[str | exp.JSONPathWildcard] = _parse_var_text()
182+
elif _match(TokenType.IDENTIFIER):
183+
value = _prev().text
160184
elif _match(TokenType.STAR):
161185
value = exp.JSONPathWildcard()
162186
else:
@@ -170,7 +194,9 @@ def _parse_bracket() -> exp.JSONPathPart:
170194
raise ParseError(_error("Expected key name or * after DOT"))
171195
elif _match(TokenType.L_BRACKET):
172196
expressions.append(_parse_bracket())
173-
elif _match(TokenType.VAR) or _match(TokenType.IDENTIFIER):
197+
elif _match(TokenType.VAR):
198+
expressions.append(exp.JSONPathKey(this=_parse_var_text()))
199+
elif _match(TokenType.IDENTIFIER):
174200
expressions.append(exp.JSONPathKey(this=_prev().text))
175201
elif _match(TokenType.STAR):
176202
expressions.append(exp.JSONPathWildcard())

tests/dialects/test_bigquery.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1574,14 +1574,6 @@ def test_bigquery(self):
15741574
"snowflake": "IFF((y) <> 0, (x) / (y), NULL)",
15751575
},
15761576
)
1577-
self.validate_all(
1578-
"""SELECT JSON_QUERY('{"class": {"students": []}}', '$.class')""",
1579-
write={
1580-
"bigquery": """SELECT JSON_QUERY('{"class": {"students": []}}', '$.class')""",
1581-
"duckdb": """SELECT '{"class": {"students": []}}' -> '$.class'""",
1582-
"snowflake": """SELECT GET_PATH(PARSE_JSON('{"class": {"students": []}}'), 'class')""",
1583-
},
1584-
)
15851577
self.validate_all(
15861578
"""SELECT JSON_VALUE_ARRAY('{"arr": [1, "a"]}', '$.arr')""",
15871579
write={
@@ -2139,7 +2131,16 @@ def test_null_ordering(self):
21392131
},
21402132
)
21412133

2142-
def test_json_extract_scalar(self):
2134+
def test_json_extract(self):
2135+
self.validate_all(
2136+
"""SELECT JSON_QUERY('{"class": {"students": []}}', '$.class')""",
2137+
write={
2138+
"bigquery": """SELECT JSON_QUERY('{"class": {"students": []}}', '$.class')""",
2139+
"duckdb": """SELECT '{"class": {"students": []}}' -> '$.class'""",
2140+
"snowflake": """SELECT GET_PATH(PARSE_JSON('{"class": {"students": []}}'), 'class')""",
2141+
},
2142+
)
2143+
21432144
for func in ("JSON_EXTRACT_SCALAR", "JSON_VALUE"):
21442145
with self.subTest(f"Testing BigQuery's {func}"):
21452146
self.validate_all(
@@ -2164,6 +2165,18 @@ def test_json_extract_scalar(self):
21642165
self.parse_one(sql).sql("bigquery", normalize_functions="upper"), sql
21652166
)
21662167

2168+
# Test double quote escaping
2169+
for func in ("JSON_VALUE", "JSON_QUERY", "JSON_QUERY_ARRAY"):
2170+
self.validate_identity(
2171+
f"{func}(doc, '$. a b c .d')", f"""{func}(doc, '$." a b c ".d')"""
2172+
)
2173+
2174+
# Test single quote & bracket escaping
2175+
for func in ("JSON_EXTRACT", "JSON_EXTRACT_SCALAR", "JSON_EXTRACT_ARRAY"):
2176+
self.validate_identity(
2177+
f"{func}(doc, '$. a b c .d')", f"""{func}(doc, '$[\\' a b c \\'].d')"""
2178+
)
2179+
21672180
def test_json_extract_array(self):
21682181
for func in ("JSON_QUERY_ARRAY", "JSON_EXTRACT_ARRAY"):
21692182
with self.subTest(f"Testing BigQuery's {func}"):

0 commit comments

Comments
 (0)