Skip to content

Commit c85e592

Browse files
SNOW-2250223: add support for use_vectorized_scanner in write_pandas (#2456)
1 parent a269f15 commit c85e592

File tree

3 files changed

+72
-0
lines changed

3 files changed

+72
-0
lines changed

DESCRIPTION.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
1919
- Added basic json support for Interval types.
2020
- Moved `OAUTH_TYPE` to `CLIENT_ENVIROMENT`.
2121
- Fixed a bug where the PAT with external session authenticator was used while `external_session_id` was not provided in `SnowflakeRestful.fetch`.
22+
- Added support for parameter `use_vectorized_scanner` in function `write_pandas`.
2223

2324
- v3.16.0 (July 04, 2025)
2425
- Bumped numpy dependency from <2.1.0 to <=2.2.4.

src/snowflake/connector/pandas_tools.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ def write_pandas(
261261
use_logical_type: bool | None = None,
262262
iceberg_config: dict[str, str] | None = None,
263263
bulk_upload_chunks: bool = False,
264+
use_vectorized_scanner: bool = False,
264265
**kwargs: Any,
265266
) -> tuple[
266267
bool,
@@ -308,6 +309,8 @@ def write_pandas(
308309
on_error: Action to take when COPY INTO statements fail, default follows documentation at:
309310
https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
310311
(Default value = 'abort_statement').
312+
use_vectorized_scanner: Boolean that specifies whether to use a vectorized scanner for loading Parquet files. See details at
313+
`copy options <https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions>`_.
311314
parallel: Number of threads to be used when uploading chunks, default follows documentation at:
312315
https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4).
313316
quote_identifiers: By default, identifiers, specifically database, schema, table and column names
@@ -579,6 +582,7 @@ def drop_object(name: str, object_type: str) -> None:
579582
f"FROM (SELECT {parquet_columns} FROM '{copy_stage_location}') "
580583
f"FILE_FORMAT=("
581584
f"TYPE=PARQUET "
585+
f"USE_VECTORIZED_SCANNER={use_vectorized_scanner} "
582586
f"COMPRESSION={compression_map[compression]}"
583587
f"{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}"
584588
f"{sql_use_logical_type}"

test/integ/pandas_it/test_pandas_tools.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1184,3 +1184,70 @@ def test_write_pandas_bulk_chunks_upload(conn_cnx, bulk_upload_chunks):
11841184
assert result["COUNT(*)"] == 4
11851185
finally:
11861186
cnx.execute_string(drop_sql)
1187+
1188+
1189+
@pytest.mark.parametrize(
1190+
"use_vectorized_scanner",
1191+
[
1192+
True,
1193+
False,
1194+
],
1195+
)
1196+
def test_write_pandas_with_use_vectorized_scanner(
1197+
conn_cnx: Callable[..., Generator[SnowflakeConnection]],
1198+
use_vectorized_scanner,
1199+
caplog,
1200+
):
1201+
"""Tests whether overwriting table using a Pandas DataFrame works as expected."""
1202+
random_table_name = random_string(5, "userspoints_")
1203+
df_data = [("Dash", 50)]
1204+
df = pandas.DataFrame(df_data, columns=["name", "points"])
1205+
1206+
table_name = random_table_name
1207+
col_id = "id"
1208+
col_name = "name"
1209+
col_points = "points"
1210+
1211+
create_sql = (
1212+
f"CREATE OR REPLACE TABLE {table_name}"
1213+
f"({col_name} STRING, {col_points} INT, {col_id} INT AUTOINCREMENT)"
1214+
)
1215+
1216+
drop_sql = f"DROP TABLE IF EXISTS {table_name}"
1217+
with conn_cnx() as cnx: # type: SnowflakeConnection
1218+
original_cur = cnx.cursor().execute
1219+
1220+
def fake_execute(query, params=None, *args, **kwargs):
1221+
return original_cur(query, params, *args, **kwargs)
1222+
1223+
cnx.execute_string(create_sql)
1224+
try:
1225+
with mock.patch(
1226+
"snowflake.connector.cursor.SnowflakeCursor.execute",
1227+
side_effect=fake_execute,
1228+
) as execute:
1229+
# Write dataframe with 1 row
1230+
success, nchunks, nrows, _ = write_pandas(
1231+
cnx,
1232+
df,
1233+
random_table_name,
1234+
quote_identifiers=False,
1235+
auto_create_table=False,
1236+
overwrite=True,
1237+
index=True,
1238+
use_vectorized_scanner=use_vectorized_scanner,
1239+
)
1240+
# Check write_pandas output
1241+
assert success
1242+
assert nchunks == 1
1243+
assert nrows == 1
1244+
1245+
for call in execute.call_args_list:
1246+
if call.args[0].startswith("COPY"):
1247+
assert (
1248+
f"USE_VECTORIZED_SCANNER={use_vectorized_scanner}"
1249+
in call.args[0]
1250+
)
1251+
1252+
finally:
1253+
cnx.execute_string(drop_sql)

0 commit comments

Comments
 (0)