Skip to content

Commit 97c2599

Browse files
authored
SNOW-787480: fix json load encoding error (#1528)
1 parent 3d4f8f4 commit 97c2599

File tree

4 files changed

+65
-2
lines changed

4 files changed

+65
-2
lines changed

DESCRIPTION.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
1010

1111
- v3.0.4(TBD)
1212
- Fixed a bug in which `cursor.execute()` could modify the argument statement_params dictionary object when executing a multistatement query.
13+
- Added the json_result_force_utf8_decoding connection parameter to force decoding JSON content in utf-8 when the result format is JSON.
1314
- Fixed a bug in which we cannot call `SnowflakeCursor.nextset` before fetching the result of the first query if the cursor runs an async multistatement query.
1415
- Bumped vendored library urllib3 to 1.26.15
1516
- Bumped vendored library requests to 2.29.0

src/snowflake/connector/connection.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,10 @@ def DefaultConverterClass() -> type:
204204
True,
205205
bool,
206206
), # Whether to log imported packages in telemetry
207+
"json_result_force_utf8_decoding": (
208+
False,
209+
bool,
210+
), # Whether to force the JSON content to be decoded in utf-8; only effective when the result format is JSON
207211
}
208212

209213
APPLICATION_RE = re.compile(r"[\w\d_]+")
@@ -265,6 +269,9 @@ class SnowflakeConnection:
265269
enable_connection_diag: when true, clients will generate a connectivity diagnostic report.
266270
connection_diag_log_path: path to location to create diag report with enable_connection_diag.
267271
connection_diag_whitelist_path: path to a whitelist.json file to test with enable_connection_diag.
272+
json_result_force_utf8_decoding: When true, the JSON result will be decoded in utf-8;
273+
when false, the encoding of the content is auto-detected. Default value is false.
274+
This parameter is only effective when the result format is JSON.
268275
"""
269276

270277
OCSP_ENV_LOCK = Lock()

src/snowflake/connector/result_batch.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def remote_chunk_info(c: dict[str, Any]) -> RemoteChunkInfo:
132132
schema,
133133
column_converters,
134134
cursor._use_dict_result,
135+
json_result_force_utf8_decoding=cursor._connection._json_result_force_utf8_decoding,
135136
)
136137
for c in chunks
137138
]
@@ -384,6 +385,8 @@ def __init__(
384385
schema: Sequence[ResultMetadata],
385386
column_converters: Sequence[tuple[str, SnowflakeConverterType]],
386387
use_dict_result: bool,
388+
*,
389+
json_result_force_utf8_decoding: bool = False,
387390
) -> None:
388391
super().__init__(
389392
rowcount,
@@ -392,6 +395,7 @@ def __init__(
392395
schema,
393396
use_dict_result,
394397
)
398+
self._json_result_force_utf8_decoding = json_result_force_utf8_decoding
395399
self.column_converters = column_converters
396400

397401
@classmethod
@@ -420,10 +424,21 @@ def _load(self, response: Response) -> list:
420424
421425
Returns:
422426
Whatever ``json.loads`` return, but in a list.
423-
Unfortunately there's not type hint for this.
427+
Unfortunately there's no type hint for this.
424428
For context: https://github.com/python/typing/issues/182
425429
"""
426-
read_data = response.text
430+
# if users specify how to decode the data, we decode the bytes using the specified encoding
431+
if self._json_result_force_utf8_decoding:
432+
try:
433+
read_data = str(response.content, "utf-8", errors="strict")
434+
except Exception as exc:
435+
err_msg = f"failed to decode json result content due to error {exc!r}"
436+
logger.error(err_msg)
437+
raise Error(msg=err_msg)
438+
else:
439+
# note: SNOW-787480 response.text decodes the content with the encoding guessed by chardet.detect
440+
# (response.apparent_encoding), which is unreliable and can be wrong, see: https://github.com/chardet/chardet/issues/148
441+
read_data = response.text
427442
return json.loads("".join(["[", read_data, "]"]))
428443

429444
def _parse(

test/integ/test_cursor.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class ResultMetadata(NamedTuple):
5151
ER_FAILED_TO_REWRITE_MULTI_ROW_INSERT,
5252
ER_NOT_POSITIVE_SIZE,
5353
)
54+
from snowflake.connector.errors import Error
5455
from snowflake.connector.sqlstate import SQLSTATE_FEATURE_NOT_SUPPORTED
5556
from snowflake.connector.telemetry import TelemetryField
5657

@@ -1618,3 +1619,42 @@ def test_multi_statement_failure(conn_cnx):
16181619
CLIENT_VERSION,
16191620
(type(None), str),
16201621
)
1622+
1623+
1624+
@pytest.mark.skipolddriver
1625+
def test_decoding_utf8_for_json_result(conn_cnx):
1626+
# SNOW-787480: if utf-8 decoding is not explicitly forced, chardet.detect
1627+
# detects the encoding of this data as windows-1250
1628+
with conn_cnx(
1629+
session_parameters={"python_connector_query_result_format": "JSON"}
1630+
) as con, con.cursor() as cur:
1631+
sql = """select '"",' || '"",' || '"",' || '"",' || '"",' || 'Ofigràfic' || '"",' from TABLE(GENERATOR(ROWCOUNT => 5000)) v;"""
1632+
ret = cur.execute(sql).fetchall()
1633+
assert len(ret) == 5000
1634+
# This test case is tricky: in most cases the decoding is incorrect and could be different
1635+
# on different platforms, however, due to randomness, in rare cases the decoding is indeed utf-8,
1636+
# the backend behavior is flaky
1637+
assert ret[0] in (
1638+
('"","","","","",OfigrĂ\xa0fic"",',), # AWS Cloud
1639+
('"","","","","",OfigrÃ\xa0fic"",',), # GCP Mac and Linux Cloud
1640+
('"","","","","",Ofigr\xc3\\xa0fic"",',), # GCP Windows Cloud
1641+
(
1642+
'"","","","","",Ofigràfic"",',
1643+
), # regression environment gets the correct decoding
1644+
)
1645+
1646+
with conn_cnx(
1647+
session_parameters={"python_connector_query_result_format": "JSON"},
1648+
json_result_force_utf8_decoding=True,
1649+
) as con, con.cursor() as cur:
1650+
ret = cur.execute(sql).fetchall()
1651+
assert len(ret) == 5000
1652+
assert ret[0] == ('"","","","","",Ofigràfic"",',)
1653+
1654+
result_batch = JSONResultBatch(
1655+
None, None, None, None, None, False, json_result_force_utf8_decoding=True
1656+
)
1657+
mock_resp = mock.Mock()
1658+
mock_resp.content = "À".encode("latin1")
1659+
with pytest.raises(Error):
1660+
result_batch._load(mock_resp)

0 commit comments

Comments
 (0)