From 9ae837f8437b0a038cb00a375a650464a27e242a Mon Sep 17 00:00:00 2001 From: Deepak Yadav Date: Fri, 20 Mar 2026 08:02:35 -0700 Subject: [PATCH 1/4] fix: Propagate INFER_SCHEMA errors for semi-structured file reads When INFER_SCHEMA fails for JSON/ORC/AVRO files (e.g. malformed content triggering error 100069), the exception was silently swallowed, leaving INFER_SCHEMA=True with no schema_to_cast. This caused a downstream AssertionError with no message, hiding the actual failure from users. Now when inference fails: FileNotFoundError is re-raised directly, other errors log a warning and set INFER_SCHEMA=False so the read falls back to a single VARIANT column (matching existing CSV fallback behavior). The real Snowflake error surfaces when the data is actually queried. Made-with: Cursor --- src/snowflake/snowpark/dataframe_reader.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/snowflake/snowpark/dataframe_reader.py b/src/snowflake/snowpark/dataframe_reader.py index 2d00d631c9..d45b6d9f32 100644 --- a/src/snowflake/snowpark/dataframe_reader.py +++ b/src/snowflake/snowpark/dataframe_reader.py @@ -1415,10 +1415,20 @@ def _read_semi_structured_file(self, path: str, format: str) -> DataFrame: new_schema, schema_to_cast, read_file_transformations, - _, # we don't check for error in case of infer schema failures. We use $1, Variant type + infer_schema_exception, ) = self._infer_schema_for_file_format(path, format) if new_schema: schema = new_schema + elif infer_schema_exception is not None: + if isinstance(infer_schema_exception, FileNotFoundError): + raise infer_schema_exception + logger.warning( + f"Could not infer schema for {format} file due to exception: " + f"{infer_schema_exception}. " + "\nFalling back to $1 VARIANT schema. " + "Please use DataFrameReader.schema() to specify a user schema for the file." 
+ ) + self._cur_options["INFER_SCHEMA"] = False metadata_project, metadata_schema = self._get_metadata_project_and_schema() From ceafc8090fd7cb780cbfa0183409af12efb6d651 Mon Sep 17 00:00:00 2001 From: Deepak Yadav Date: Fri, 20 Mar 2026 08:10:48 -0700 Subject: [PATCH 2/4] test: Add unit tests for semi-structured file infer schema failure handling Cover three scenarios for JSON, AVRO, ORC, and Parquet formats: - Generic error: logs warning and falls back to VARIANT schema - FileNotFoundError: re-raised directly to the caller - Successful inference: no warning logged, normal behavior Made-with: Cursor --- tests/unit/test_dataframe.py | 92 ++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index e4e5d95d24..4d3e94f5f2 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -2,6 +2,7 @@ # # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. # +import logging import re from unittest import mock @@ -143,6 +144,97 @@ def nop(name): ) +def _create_fake_session(sql_simplifier_enabled=True): + """Build a minimal fake session suitable for DataFrameReader unit tests.""" + + def nop(name): + return name + + fake_session = mock.create_autospec(snowflake.snowpark.session.Session) + fake_session.sql_simplifier_enabled = sql_simplifier_enabled + fake_session._cte_optimization_enabled = False + fake_session._query_compilation_stage_enabled = False + fake_session._join_alias_fix = False + fake_session._conn = mock.create_autospec(ServerConnection) + fake_session._conn._thread_safe_session_enabled = True + fake_session._plan_builder = SnowflakePlanBuilder(fake_session) + fake_session._analyzer = Analyzer(fake_session) + fake_session._use_scoped_temp_objects = True + fake_session._ast_batch = mock.create_autospec(AstBatch) + fake_session.get_fully_qualified_name_if_possible = nop + return fake_session + + +@pytest.mark.parametrize("format_type", 
["json", "avro", "orc", "parquet"]) +def test_read_semi_structured_infer_schema_generic_error(format_type, caplog): + """When _infer_schema_for_file_format returns a non-FileNotFoundError, + the reader should log a warning, set INFER_SCHEMA=False, and still + return a DataFrame with the $1 VARIANT fallback schema.""" + error = RuntimeError("Cannot infer schema: error 100069") + + def mock_infer(*args, **kwargs): + return None, None, None, error + + fake_session = _create_fake_session() + reader = DataFrameReader(fake_session).option("INFER_SCHEMA", True) + + with mock.patch( + "snowflake.snowpark.dataframe_reader.DataFrameReader._infer_schema_for_file_format", + mock_infer, + ): + with caplog.at_level(logging.WARNING): + df = getattr(reader, format_type)("@stage/file") + + assert df is not None + assert f"Could not infer schema for {format_type.upper()} file" in caplog.text + assert "100069" in caplog.text + assert "Falling back to $1 VARIANT schema" in caplog.text + + +@pytest.mark.parametrize("format_type", ["json", "avro", "orc", "parquet"]) +def test_read_semi_structured_infer_schema_file_not_found(format_type): + """When _infer_schema_for_file_format returns a FileNotFoundError, + the reader should re-raise it directly.""" + error = FileNotFoundError("Stage path does not exist or not authorized") + + def mock_infer(*args, **kwargs): + return None, None, None, error + + fake_session = _create_fake_session() + reader = DataFrameReader(fake_session).option("INFER_SCHEMA", True) + + with mock.patch( + "snowflake.snowpark.dataframe_reader.DataFrameReader._infer_schema_for_file_format", + mock_infer, + ): + with pytest.raises(FileNotFoundError, match="not authorized"): + getattr(reader, format_type)("@stage/file") + + +@pytest.mark.parametrize("format_type", ["json", "avro", "orc", "parquet"]) +def test_read_semi_structured_infer_schema_success_no_warning(format_type, caplog): + """When _infer_schema_for_file_format succeeds, no warning should be logged + and 
INFER_SCHEMA should remain True.""" + schema = [Attribute('"col1"', StringType())] + schema_to_cast = [("$1:col1::VARCHAR", "col1")] + + def mock_infer(*args, **kwargs): + return schema, schema_to_cast, None, None + + fake_session = _create_fake_session() + reader = DataFrameReader(fake_session).option("INFER_SCHEMA", True) + + with mock.patch( + "snowflake.snowpark.dataframe_reader.DataFrameReader._infer_schema_for_file_format", + mock_infer, + ): + with caplog.at_level(logging.WARNING): + df = getattr(reader, format_type)("@stage/file") + + assert df is not None + assert "Could not infer schema" not in caplog.text + + def test_select_negative(): AST_ENABLED = False set_ast_state(AstFlagSource.TEST, AST_ENABLED) From 1f3e88c408ec6ba737de71c9e593cb40dce21c05 Mon Sep 17 00:00:00 2001 From: Deepak Yadav Date: Wed, 25 Mar 2026 15:37:40 -0700 Subject: [PATCH 3/4] fix: Skip schema inference on MockServerConnection and update read_json tests --- src/snowflake/snowpark/dataframe_reader.py | 37 +++++++++++----------- tests/integ/modin/io/test_read_json.py | 8 ++--- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/snowflake/snowpark/dataframe_reader.py b/src/snowflake/snowpark/dataframe_reader.py index 8364c54f80..949b6c6624 100644 --- a/src/snowflake/snowpark/dataframe_reader.py +++ b/src/snowflake/snowpark/dataframe_reader.py @@ -1624,24 +1624,25 @@ def _read_semi_structured_file(self, path: str, format: str) -> DataFrame: use_user_schema = True elif self._infer_schema: - ( - new_schema, - schema_to_cast, - read_file_transformations, - infer_schema_exception, - ) = self._infer_schema_for_file_format(path, format) - if new_schema: - schema = new_schema - elif infer_schema_exception is not None: - if isinstance(infer_schema_exception, FileNotFoundError): - raise infer_schema_exception - logger.warning( - f"Could not infer schema for {format} file due to exception: " - f"{infer_schema_exception}. " - "\nFalling back to $1 VARIANT schema. " - "Please use DataFrameReader.schema() to specify a user schema for the file." 
- ) - self._cur_options["INFER_SCHEMA"] = False + if not isinstance(self._session._conn, MockServerConnection): + ( + new_schema, + schema_to_cast, + read_file_transformations, + infer_schema_exception, + ) = self._infer_schema_for_file_format(path, format) + if new_schema: + schema = new_schema + elif infer_schema_exception is not None: + if isinstance(infer_schema_exception, FileNotFoundError): + raise infer_schema_exception + logger.warning( + f"Could not infer schema for {format} file due to exception: " + f"{infer_schema_exception}. " + "\nFalling back to $1 VARIANT schema. " + "Please use DataFrameReader.schema() to specify a user schema for the file." + ) + self._cur_options["INFER_SCHEMA"] = False metadata_project, metadata_schema = self._get_metadata_project_and_schema() diff --git a/tests/integ/modin/io/test_read_json.py b/tests/integ/modin/io/test_read_json.py index bc3879d961..2dc05319f1 100644 --- a/tests/integ/modin/io/test_read_json.py +++ b/tests/integ/modin/io/test_read_json.py @@ -16,6 +16,7 @@ import pytest import snowflake.snowpark.modin.plugin # noqa: F401 +from snowflake.snowpark.exceptions import SnowparkSQLException from tests.integ.modin.utils import assert_frame_equal from tests.integ.utils.sql_counter import SqlCounter, sql_count_checker from tests.utils import Utils @@ -294,9 +295,6 @@ def test_read_json_staged_folder(): @sql_count_checker(query_count=4) -@pytest.mark.xfail( - reason="SNOW-1336174: Remove xfail by handling empty JSON files", strict=True -) def test_read_json_empty_file(): with open("test_read_json_empty_file.json", "w"): pass @@ -307,13 +305,13 @@ def test_read_json_empty_file(): os.remove("test_read_json_empty_file.json") -@sql_count_checker(query_count=3) +@sql_count_checker(query_count=4) def test_read_json_malformed_file_negative(): with open("test_read_json_malformed_file.json", "w") as f: f.write("{a: 3, key_no_value}") - with pytest.raises(AssertionError): + with pytest.raises(SnowparkSQLException): 
pd.read_json("test_read_json_malformed_file.json") os.remove("test_read_json_malformed_file.json") From ba3a6e32e85ca2719edfd58cf0395ff86eb28390 Mon Sep 17 00:00:00 2001 From: Deepak Yadav Date: Wed, 25 Mar 2026 17:17:56 -0700 Subject: [PATCH 4/4] test: Raise expected query count to 5 for malformed JSON read --- tests/integ/modin/io/test_read_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/modin/io/test_read_json.py b/tests/integ/modin/io/test_read_json.py index 2dc05319f1..0bb995ef84 100644 --- a/tests/integ/modin/io/test_read_json.py +++ b/tests/integ/modin/io/test_read_json.py @@ -305,7 +305,7 @@ def test_read_json_empty_file(): os.remove("test_read_json_empty_file.json") -@sql_count_checker(query_count=4) +@sql_count_checker(query_count=5) def test_read_json_malformed_file_negative(): with open("test_read_json_malformed_file.json", "w") as f: