Commit d5b232d

SNOW-2367027: address behavioral gap between udtf and parquet approach (#3848)
1 parent 2e84f16 commit d5b232d

File tree: 14 files changed, +206 -194 lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -67,6 +67,10 @@
 - Fixed a bug where writing Snowpark pandas dataframes on the pandas backend with a column multiindex to Snowflake with `to_snowflake` would raise `KeyError`.
 - Fixed a bug that `DataFrameReader.dbapi` (PuPr) is not compatible with oracledb 3.4.0.
 
+#### Improvements
+
+- The default maximum length for inferred StringType columns during schema inference in `DataFrameReader.dbapi` is now increased from 16MB to 128MB in parquet file based ingestion.
+
 #### Dependency Updates
 
 - Updated dependency of `snowflake-connector-python>=3.17,<5.0.0`.
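Illustration of the changelog entry above (a minimal sketch, not part of the commit: it assumes an active Snowpark `session`, plus a hypothetical `create_connection` factory and `dbapi_kwargs` for the external source):

```python
from snowflake.snowpark.types import StringType

# With parquet file based ingestion, string columns whose length cannot be
# inferred from the source now come back as StringType(134217728) (128MB)
# instead of StringType(16777216) (16MB), matching UDTF based ingestion.
df = session.read.dbapi(create_connection, **dbapi_kwargs)
for field in df.schema.fields:
    if isinstance(field.datatype, StringType):
        print(field.name, field.datatype.length)  # 134217728 for columns without an explicit length
```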

src/snowflake/snowpark/_internal/data_source/drivers/base_driver.py

Lines changed: 25 additions & 6 deletions
@@ -11,6 +11,7 @@
     Connection,
     Cursor,
 )
+from snowflake.snowpark._internal.server_connection import MAX_STRING_SIZE
 from snowflake.snowpark._internal.utils import (
     get_sorted_key_for_version,
     measure_time,
@@ -27,6 +28,7 @@
     BinaryType,
     DateType,
     BooleanType,
+    StringType,
 )
 import snowflake.snowpark
 import logging
@@ -103,7 +105,16 @@ def infer_schema_from_description(
         query_input_alias: str,
     ) -> StructType:
         self.get_raw_schema(table_or_query, cursor, is_query, query_input_alias)
-        return self.to_snow_type(self.raw_schema)
+        generated_schema = self.to_snow_type(self.raw_schema)
+        # snowflake will default string length to 128MB in the bundle which will be enabled in 2026-01
+        # https://docs.snowflake.com/en/release-notes/bcr-bundles/2025_07_bundle
+        # here we prematurely make the change to default string to
+        # 1. align the string length with UDTF based ingestion
+        # 2. avoid the BCR impact to dbapi feature
+        for field in generated_schema.fields:
+            if isinstance(field.datatype, StringType) and field.datatype.length is None:
+                field.datatype.length = MAX_STRING_SIZE
+        return generated_schema
 
     def infer_schema_from_description_with_error_control(
         self, table_or_query: str, is_query: bool, query_input_alias: str
@@ -184,7 +195,10 @@ def udtf_ingestion(
         select * from {partition_table}, table({udtf_name}({PARTITION_TABLE_COLUMN_NAME}))
         """
         res = session.sql(call_udtf_sql, _emit_ast=_emit_ast)
-        return self.to_result_snowpark_df_udtf(res, schema, _emit_ast=_emit_ast)
+        return BaseDriver.keep_nullable_attributes(
+            self.to_result_snowpark_df_udtf(res, schema, _emit_ast=_emit_ast),
+            schema,
+        )
 
     def udtf_class_builder(
         self,
@@ -284,6 +298,14 @@ def to_result_snowpark_df(
     ) -> "DataFrame":
         return session.table(table_name, _emit_ast=_emit_ast)
 
+    @staticmethod
+    def keep_nullable_attributes(
+        selected_df: "DataFrame", schema: StructType
+    ) -> "DataFrame":
+        for attr, source_field in zip(selected_df._plan.attributes, schema.fields):
+            attr.nullable = source_field.nullable
+        return selected_df
+
     @staticmethod
     def to_result_snowpark_df_udtf(
         res_df: "DataFrame",
@@ -294,10 +316,7 @@ def to_result_snowpark_df_udtf(
             res_df[field.name].cast(field.datatype).alias(field.name)
             for field in schema.fields
         ]
-        selected_df = res_df.select(cols, _emit_ast=_emit_ast)
-        for attr, source_field in zip(selected_df._plan.attributes, schema.fields):
-            attr.nullable = source_field.nullable
-        return selected_df
+        return res_df.select(cols, _emit_ast=_emit_ast)
 
     def get_server_cursor_if_supported(self, conn: "Connection") -> "Cursor":
         """

src/snowflake/snowpark/_internal/server_connection.py

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@
 PARAM_INTERNAL_APPLICATION_NAME = "internal_application_name"
 PARAM_INTERNAL_APPLICATION_VERSION = "internal_application_version"
 DEFAULT_STRING_SIZE = 16777216
+MAX_STRING_SIZE = 134217728
 
 
 def _build_target_path(stage_location: str, dest_prefix: str = "") -> str:
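For reference, both constants are mebibyte counts expressed in bytes, so the "16MB to 128MB" wording in the changelog lines up with the raw values:

```python
assert 16777216 == 16 * 1024**2     # DEFAULT_STRING_SIZE, 16MB
assert 134217728 == 128 * 1024**2   # MAX_STRING_SIZE, 128MB
```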

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 18 additions & 12 deletions
@@ -1707,18 +1707,24 @@ def dbapi(
         Reads data from a database table or query into a DataFrame using a DBAPI connection,
         with support for optional partitioning, parallel processing, and query customization.
 
-        There are multiple methods to partition data and accelerate ingestion.
-        These methods can be combined to achieve optimal performance:
-
-        1.Use column, lower_bound, upper_bound and num_partitions at the same time when you need to split large tables into smaller partitions for parallel processing.
-        These must all be specified together, otherwise error will be raised.
-        2.Set max_workers to a proper positive integer.
-        This defines the maximum number of processes and threads used for parallel execution.
-        3.Adjusting fetch_size can optimize performance by reducing the number of round trips to the database.
-        4.Use predicates to defining WHERE conditions for partitions,
-        predicates will be ignored if column is specified to generate partition.
-        5.Set custom_schema to avoid snowpark infer schema, custom_schema must have a matched
-        column name with table in external data source.
+        Usage Notes:
+            - Ingestion performance tuning:
+                - **Partitioning**: Use ``column``, ``lower_bound``, ``upper_bound``, and ``num_partitions``
+                  together to split large tables into smaller partitions for parallel processing.
+                  All four parameters must be specified together, otherwise an error will be raised.
+                - **Parallel execution**: Set ``max_workers`` to control the maximum number of processes
+                  and threads used for parallel execution.
+                - **Fetch optimization**: Adjust ``fetch_size`` to optimize performance by reducing
+                  the number of round trips to the database.
+                - **Partition filtering**: Use ``predicates`` to define WHERE conditions for partitions.
+                  Note that ``predicates`` will be ignored if ``column`` is specified for partitioning.
+                - **Schema specification**: Set ``custom_schema`` to skip schema inference. The custom schema
+                  must have matching column names with the table in the external data source.
+            - Execution timing and error handling:
+                - **UDTF Ingestion**: Uses lazy evaluation. Errors are reported as ``SnowparkSQLException``
+                  during DataFrame actions (e.g., ``DataFrame.collect()``).
+                - **Local Ingestion**: Uses eager execution. Errors are reported immediately as
+                  ``SnowparkDataFrameReaderException`` when this method is called.
 
         Args:
             create_connection: A callable that returns a DB-API compatible database connection.
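A usage sketch of the tuning knobs described in the updated docstring (everything concrete here is an assumption for illustration: the `oracledb` driver, the credentials, the `table` keyword, and the bounds are not taken from the commit; `session` is assumed to be an active Snowpark session):

```python
import oracledb  # any DB-API compatible driver; oracledb is only an example

def create_connection():
    # Hypothetical credentials; replace with your own connection parameters.
    return oracledb.connect(user="scott", password="tiger", dsn="dbhost/orclpdb")

# Partitioned, parallel ingestion: column/lower_bound/upper_bound/num_partitions
# are specified together, so any `predicates` would be ignored.
df = session.read.dbapi(
    create_connection,
    table="ORDERS",        # assumed name of the source table
    column="ORDER_ID",     # numeric column used to split partitions
    lower_bound=0,
    upper_bound=1_000_000,
    num_partitions=8,
    max_workers=4,         # max processes/threads for parallel execution
    fetch_size=10_000,     # rows fetched per round trip to the source database
)
df.collect()  # with UDTF ingestion, source-side errors surface here as SnowparkSQLException
```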

tests/integ/datasource/test_databricks.py

Lines changed: 0 additions & 3 deletions
@@ -177,9 +177,6 @@ def test_double_quoted_column_databricks(session, custom_schema):
     [("table", TEST_TABLE_NAME), ("query", f"(SELECT * FROM {TEST_TABLE_NAME})")],
 )
 @pytest.mark.udf
-@pytest.mark.skipif(
-    sys.version_info[:2] == (3, 13), reason="driver not supported in python 3.13"
-)
 def test_udtf_ingestion_databricks(session, input_type, input_value, caplog):
     # we define here to avoid test_databricks.py to be pickled and unpickled in UDTF
     def local_create_databricks_connection():

tests/integ/datasource/test_mysql.py

Lines changed: 1 addition & 4 deletions
@@ -3,7 +3,6 @@
 #
 import logging
 import math
-import sys
 from decimal import Decimal
 
 import pytest
@@ -225,9 +224,6 @@ def test_infer_type_from_data(data, number_of_columns, expected_result):
 
 
 @pytest.mark.udf
-@pytest.mark.skipif(
-    sys.version_info[:2] == (3, 13), reason="driver not supported in python 3.13"
-)
 def test_udtf_ingestion_mysql(session, caplog):
     from tests.parameters import MYSQL_CONNECTION_PARAMETERS
 
@@ -251,6 +247,7 @@ def create_connection_mysql():
     ).order_by("ID")
 
     Utils.check_answer(df, mysql_real_data)
+    assert df.schema == mysql_schema
 
     # check that udtf is used
     assert (

tests/integ/datasource/test_oracledb.py

Lines changed: 1 addition & 4 deletions
@@ -4,7 +4,6 @@
 
 import logging
 import math
-import sys
 from collections import namedtuple
 from unittest.mock import patch
 
@@ -154,9 +153,6 @@ def test_oracledb_driver_coverage(caplog):
 
 
 @pytest.mark.udf
-@pytest.mark.skipif(
-    sys.version_info[:2] == (3, 13), reason="driver not supported in python 3.13"
-)
 def test_udtf_ingestion_oracledb(session):
     from tests.parameters import ORACLEDB_CONNECTION_PARAMETERS
 
@@ -183,6 +179,7 @@ def create_connection_oracledb():
     ).order_by("ID")
 
     Utils.check_answer(df, oracledb_real_data)
+    assert df.schema == oracledb_real_schema
 
     # check that udtf is used
     flag = False

tests/integ/datasource/test_postgres.py

Lines changed: 1 addition & 6 deletions
@@ -1,8 +1,6 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-import sys
-
 import pytest
 
 from snowflake.snowpark import Row
@@ -174,9 +172,6 @@ def test_unicode_column_name_postgres(session, custom_schema):
     ],
 )
 @pytest.mark.udf
-@pytest.mark.skipif(
-    sys.version_info[:2] == (3, 13), reason="driver not supported in python 3.13"
-)
 def test_udtf_ingestion_postgres(session, input_type, input_value, caplog):
     from tests.parameters import POSTGRES_CONNECTION_PARAMETERS
 
@@ -196,7 +191,7 @@ def create_connection_postgres():
         },
     ).order_by("BIGSERIAL_COL")
 
-    assert df.collect() == EXPECTED_TEST_DATA
+    assert df.collect() == EXPECTED_TEST_DATA and df.schema == postgres_schema
     # assert UDTF creation and UDTF call
     assert (
         "TEMPORARY FUNCTION SNOWPARK_TEMP_FUNCTION" "" in caplog.text

tests/integ/datasource/test_sql_server.py

Lines changed: 2 additions & 31 deletions
@@ -5,7 +5,6 @@
 import pytest
 
 from snowflake.snowpark._internal.data_source.utils import DBMS_TYPE
-from snowflake.snowpark.types import StringType
 
 from tests.parameters import SQL_SERVER_CONNECTION_PARAMETERS
 from tests.utils import IS_IN_STORED_PROC, Utils, IS_WINDOWS, IS_MACOS, RUNNING_ON_GH
@@ -63,30 +62,7 @@ def verify_save_table_result(
     df = df.order_by("ID")
 
     Utils.check_answer(df, expected_data)
-
-    def verify_schemas(df, expected_schema, ignore_string_size):
-        # TODO: SNOW-2362041
-        # - UDTF ingestion returning StringType 128 MB (due to variant default to 128MB)
-        # - parquet based ingestion returning StringType 16 MB
-        # we should align the two
-        for field, expected_field in zip(df.schema.fields, expected_schema.fields):
-            if isinstance(field.datatype, StringType):
-                assert isinstance(field.datatype, type(expected_field.datatype))
-                if ignore_string_size:
-                    assert (
-                        field.datatype.length == expected_field.datatype.length
-                        or field.datatype.length == 134217728
-                    )
-                else:
-                    assert field.datatype.length == expected_field.datatype.length
-            else:
-                assert field.datatype == expected_field.datatype
-            assert field.name == expected_field.name
-            assert field.nullable == expected_field.nullable
-
-    verify_schemas(df, expected_schema, ignore_string_size)
-    # after the fix SNOW-2362041, we should be able to enable this assertion
-    # assert df.schema == expected_schema
+    assert df.schema == expected_schema
 
     table_name = Utils.random_table_name()
     # save and read
@@ -97,9 +73,7 @@ def verify_schemas(df, expected_schema, ignore_string_size):
     read_table = read_table.order_by("ID")
 
     Utils.check_answer(read_table, expected_data)
-    verify_schemas(read_table, expected_schema, ignore_string_size)
-    # after the fix SNOW-2362041, we should be able to enable this assertion
-    # assert read_table.schema == expected_schema
+    assert read_table.schema == expected_schema
 
 
 def create_connection_sql_server():
@@ -365,9 +339,6 @@ def connection_func():
     with pytest.raises(
         SnowparkClientException, match="Must declare the scalar variable"
     ):
-        # TODO: 2362041, UDTF error experience is different from parquet ingestion
-        # 1. UDTF needs .collect() to trigger the error while parquet ingestion triggers on .dbapi()
-        # 2. error exception is different
         session.read.dbapi(connection_func, **dbapi_kwargs).collect()
tests/resources/test_data_source_dir/test_data_source_data.py

Lines changed: 19 additions & 18 deletions
@@ -31,6 +31,7 @@
     NullType,
     TimestampTimeZone,
 )
+from snowflake.snowpark._internal.server_connection import MAX_STRING_SIZE
 
 
 # we manually mock these objects because mock object cannot be used in multi-process as they are not pickleable
@@ -95,12 +96,12 @@ def execute(self, sql: str):
         StructField("NUMBER_COL", DecimalType(10, 2), nullable=True),
         StructField("BINARY_FLOAT_COL", DoubleType(), nullable=True),
         StructField("BINARY_DOUBLE_COL", DoubleType(), nullable=True),
-        StructField("VARCHAR2_COL", StringType(16777216), nullable=True),
-        StructField("CHAR_COL", StringType(16777216), nullable=True),
-        StructField("CLOB_COL", StringType(16777216), nullable=True),
-        StructField("NCHAR_COL", StringType(16777216), nullable=True),
-        StructField("NVARCHAR2_COL", StringType(16777216), nullable=True),
-        StructField("NCLOB_COL", StringType(16777216), nullable=True),
+        StructField("VARCHAR2_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("CHAR_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("CLOB_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("NCHAR_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("NVARCHAR2_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("NCLOB_COL", StringType(MAX_STRING_SIZE), nullable=True),
         StructField("DATE_COL", DateType(), nullable=True),
         StructField(
             "TIMESTAMP_COL", TimestampType(TimestampTimeZone.NTZ), nullable=True
@@ -131,12 +132,12 @@ def execute(self, sql: str):
         StructField("NUMBER_COL", DecimalType(10, 2), nullable=True),
         StructField("BINARY_FLOAT_COL", DoubleType(), nullable=True),
        StructField("BINARY_DOUBLE_COL", DoubleType(), nullable=True),
-        StructField("VARCHAR2_COL", StringType(16777216), nullable=True),
-        StructField("CHAR_COL", StringType(16777216), nullable=True),
-        StructField("CLOB_COL", StringType(16777216), nullable=True),
-        StructField("NCHAR_COL", StringType(16777216), nullable=True),
-        StructField("NVARCHAR2_COL", StringType(16777216), nullable=True),
-        StructField("NCLOB_COL", StringType(16777216), nullable=True),
+        StructField("VARCHAR2_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("CHAR_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("CLOB_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("NCHAR_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("NVARCHAR2_COL", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("NCLOB_COL", StringType(MAX_STRING_SIZE), nullable=True),
         StructField("DATE_COL", DateType(), nullable=True),
         StructField(
             "TIMESTAMP_COL", TimestampType(TimestampTimeZone.NTZ), nullable=True
@@ -156,18 +157,18 @@ def execute(self, sql: str):
 oracledb_unicode_schema = StructType(
     [
         StructField('"編號"', LongType(), nullable=False),
-        StructField('"姓名"', StringType(16777216), nullable=True),
-        StructField('"國家"', StringType(16777216), nullable=True),
-        StructField('"備註"', StringType(16777216), nullable=True),
+        StructField('"姓名"', StringType(MAX_STRING_SIZE), nullable=True),
+        StructField('"國家"', StringType(MAX_STRING_SIZE), nullable=True),
+        StructField('"備註"', StringType(MAX_STRING_SIZE), nullable=True),
     ]
 )
 
 oracledb_double_quoted_schema = StructType(
     [
         StructField("ID", LongType(), nullable=False),
-        StructField("FULLNAME", StringType(16777216), nullable=True),
-        StructField("COUNTRY", StringType(16777216), nullable=True),
-        StructField("NOTES", StringType(16777216), nullable=True),
+        StructField("FULLNAME", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("COUNTRY", StringType(MAX_STRING_SIZE), nullable=True),
+        StructField("NOTES", StringType(MAX_STRING_SIZE), nullable=True),
     ]
 )
0 commit comments

Comments
 (0)