|
2 | 2 | # |
3 | 3 | # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. |
4 | 4 | # |
| 5 | +import logging |
5 | 6 | import re |
6 | 7 | from unittest import mock |
7 | 8 |
|
@@ -143,6 +144,97 @@ def nop(name): |
143 | 144 | ) |
144 | 145 |
|
145 | 146 |
|
def _create_fake_session(sql_simplifier_enabled=True):
    """Return an autospec'd Session stub wired up for DataFrameReader unit tests.

    Args:
        sql_simplifier_enabled: Value assigned to the stub's
            ``sql_simplifier_enabled`` attribute.

    Returns:
        The mocked session, carrying an autospec'd ``ServerConnection``, a
        ``SnowflakePlanBuilder`` and an ``Analyzer`` constructed on the stub,
        and an autospec'd ``AstBatch``.
    """
    session = mock.create_autospec(snowflake.snowpark.session.Session)
    session.sql_simplifier_enabled = sql_simplifier_enabled

    # These three session flags are all set to False.
    for disabled_flag in (
        "_cte_optimization_enabled",
        "_query_compilation_stage_enabled",
        "_join_alias_fix",
    ):
        setattr(session, disabled_flag, False)

    connection = mock.create_autospec(ServerConnection)
    connection._thread_safe_session_enabled = True
    session._conn = connection

    # The plan builder and analyzer receive the stub itself, so they are
    # created only after the attributes above are in place.
    session._plan_builder = SnowflakePlanBuilder(session)
    session._analyzer = Analyzer(session)

    session._use_scoped_temp_objects = True
    session._ast_batch = mock.create_autospec(AstBatch)
    # Identity stand-in: the given name is handed back unchanged.
    session.get_fully_qualified_name_if_possible = lambda name: name
    return session
| 166 | + |
| 167 | + |
@pytest.mark.parametrize("format_type", ["json", "avro", "orc", "parquet"])
def test_read_semi_structured_infer_schema_generic_error(format_type, caplog):
    """A non-FileNotFoundError from schema inference must not abort the read.

    ``_infer_schema_for_file_format`` is patched to hand back a RuntimeError
    in the error slot of its result tuple. The reader is expected to log a
    warning about the failed inference and the ``$1 VARIANT`` fallback, and
    still return a DataFrame. The test asserts on the returned object and on
    the captured log text.
    """
    inference_failure = RuntimeError("Cannot infer schema: error 100069")
    reader = DataFrameReader(_create_fake_session()).option("INFER_SCHEMA", True)

    # Patched onto the class, so the stub also receives ``self``; every
    # argument is ignored and only the error slot of the tuple is populated.
    with mock.patch(
        "snowflake.snowpark.dataframe_reader.DataFrameReader._infer_schema_for_file_format",
        new=lambda *args, **kwargs: (None, None, None, inference_failure),
    ), caplog.at_level(logging.WARNING):
        read_method = getattr(reader, format_type)
        df = read_method("@stage/file")

    assert df is not None
    captured = caplog.text
    assert f"Could not infer schema for {format_type.upper()} file" in captured
    assert "100069" in captured
    assert "Falling back to $1 VARIANT schema" in captured
| 192 | + |
| 193 | + |
@pytest.mark.parametrize("format_type", ["json", "avro", "orc", "parquet"])
def test_read_semi_structured_infer_schema_file_not_found(format_type):
    """A FileNotFoundError reported by schema inference must reach the caller.

    ``_infer_schema_for_file_format`` is patched to hand back a
    FileNotFoundError in the error slot of its result tuple; calling the
    format-specific read method is then expected to raise that error.
    """
    missing_stage = FileNotFoundError("Stage path does not exist or not authorized")
    reader = DataFrameReader(_create_fake_session()).option("INFER_SCHEMA", True)

    # Patched onto the class, so the stub also receives ``self``; every
    # argument is ignored and only the error slot of the tuple is populated.
    with mock.patch(
        "snowflake.snowpark.dataframe_reader.DataFrameReader._infer_schema_for_file_format",
        new=lambda *args, **kwargs: (None, None, None, missing_stage),
    ), pytest.raises(FileNotFoundError, match="not authorized"):
        read_method = getattr(reader, format_type)
        read_method("@stage/file")
| 212 | + |
| 213 | + |
@pytest.mark.parametrize("format_type", ["json", "avro", "orc", "parquet"])
def test_read_semi_structured_infer_schema_success_no_warning(format_type, caplog):
    """Successful schema inference must not emit the fallback warning.

    ``_infer_schema_for_file_format`` is patched to hand back a one-column
    schema with no error. The test asserts that a DataFrame comes back and
    that "Could not infer schema" is absent from the captured log text.
    """
    inferred_schema = [Attribute('"col1"', StringType())]
    cast_expressions = [("$1:col1::VARCHAR", "col1")]
    reader = DataFrameReader(_create_fake_session()).option("INFER_SCHEMA", True)

    # Patched onto the class, so the stub also receives ``self``; every
    # argument is ignored and the error slot of the tuple stays None.
    with mock.patch(
        "snowflake.snowpark.dataframe_reader.DataFrameReader._infer_schema_for_file_format",
        new=lambda *args, **kwargs: (inferred_schema, cast_expressions, None, None),
    ), caplog.at_level(logging.WARNING):
        read_method = getattr(reader, format_type)
        df = read_method("@stage/file")

    assert df is not None
    assert "Could not infer schema" not in caplog.text
| 236 | + |
| 237 | + |
146 | 238 | def test_select_negative(): |
147 | 239 | AST_ENABLED = False |
148 | 240 | set_ast_state(AstFlagSource.TEST, AST_ENABLED) |
|
0 commit comments