Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/snowflake/connector/pandas_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ def write_pandas(
table_type: Literal["", "temp", "temporary", "transient"] = "",
use_logical_type: bool | None = None,
iceberg_config: dict[str, str] | None = None,
use_vectorized_scanner: bool | None = None,
**kwargs: Any,
) -> tuple[
bool,
Expand Down Expand Up @@ -335,7 +336,10 @@ def write_pandas(
* base_location: the base directory that snowflake can write iceberg metadata and files to
* catalog_sync: optionally sets the catalog integration configured for Polaris Catalog
* storage_serialization_policy: specifies the storage serialization policy for the table

use_vectorized_scanner: Boolean that specifies to use a vectorized scanner for loading Parquet files.
Using the vectorized scanner can significantly reduce the latency for loading Parquet files. To enable
vectorized scanning of Parquet files, set use_vectorized_scanner to True. Set to None to use Snowflake's default.
For more information, see: https://docs.snowflake.com/en/sql-reference/sql/copy-into-table#label-use-vectorized-scanner


Returns:
Expand Down Expand Up @@ -579,6 +583,7 @@ def drop_object(name: str, object_type: str) -> None:
f"COMPRESSION={compression_map[compression]}"
f"{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}"
f"{sql_use_logical_type}"
f"{' USE_VECTORIZED_SCANNER=' + str(use_vectorized_scanner).upper() if use_vectorized_scanner is not None else ''}"
f") "
f"PURGE=TRUE ON_ERROR=?"
)
Expand Down
57 changes: 57 additions & 0 deletions test/unit/test_pandas_tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#
# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
#

from typing import Union
from unittest.mock import MagicMock

import pandas as pd
import pytest

from snowflake.connector import pandas_tools

from .mock_utils import mock_connection


@pytest.mark.parametrize(
    ("use_vectorized_scanner", "expected_file_format"),
    [
        (None, "FILE_FORMAT=(TYPE=PARQUET COMPRESSION=auto)"),
        (
            True,
            "FILE_FORMAT=(TYPE=PARQUET COMPRESSION=auto USE_VECTORIZED_SCANNER=TRUE)",
        ),
        (
            False,
            "FILE_FORMAT=(TYPE=PARQUET COMPRESSION=auto USE_VECTORIZED_SCANNER=FALSE)",
        ),
    ],
)
def test_write_pandas_use_vectorized_scanner(
    use_vectorized_scanner: Union[bool, None], expected_file_format: str
):
    """Check that write_pandas emits USE_VECTORIZED_SCANNER in the COPY INTO
    file-format clause only when the option is explicitly set (True/False),
    and omits it entirely when left as None."""
    # Minimal frame and a fully mocked connection/cursor so no network I/O occurs.
    frame = pd.DataFrame({"col1": [1, 2, 3]})

    connection = mock_connection()
    cursor = MagicMock()
    connection.cursor.return_value = cursor

    pandas_tools.write_pandas(
        conn=connection,
        df=frame,
        table_name="test_table",
        schema="test_schema",
        database="test_database",
        use_vectorized_scanner=use_vectorized_scanner,
    )

    # First positional argument of each execute() call is the SQL text.
    issued_sql = (call[0][0] for call in cursor.execute.call_args_list)

    assert any(
        "COPY INTO" in statement and expected_file_format in statement
        for statement in issued_sql
    )