
Commit c53aad7

SNOW-1947479 Add bulk_upload_chunks parameter to write_pandas (#2322)
1 parent 07230cf commit c53aad7

3 files changed: 67 additions and 6 deletions


DESCRIPTION.md

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,8 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
 - v3.16(TBD)
   - Bumped numpy dependency from <2.1.0 to <=2.2.4
   - Added Windows support for Python 3.13.
+  - Added `bulk_upload_chunks` parameter to the `write_pandas` function. When set to True, `write_pandas` first writes all data chunks to the local disk and then uploads the chunk folder to the stage with a single wildcard upload; by default, chunks are saved, uploaded, and deleted one at a time.
+
 
 - v3.15.1(May 20, 2025)
   - Added basic arrow support for Interval types.
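
For orientation, a minimal usage sketch of the new parameter is shown below; the connection arguments and the table name are placeholders and not part of this commit:

# Hedged usage sketch: account, credentials, and table name are placeholders.
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

df = pd.DataFrame(
    [("Dash", 50), ("Luke", 20), ("Mark", 10), ("John", 30)],
    columns=["name", "points"],
)

with snowflake.connector.connect(
    account="<account>",
    user="<user>",
    password="<password>",
    database="<database>",
    schema="<schema>",
    warehouse="<warehouse>",
) as cnx:
    # With bulk_upload_chunks=True, every parquet chunk is written to a local
    # temporary folder first and then pushed to the stage in one wildcard upload,
    # instead of being saved, uploaded, and deleted one chunk at a time.
    success, nchunks, nrows, _ = write_pandas(
        cnx,
        df,
        "USERS_POINTS",
        auto_create_table=True,
        chunk_size=1,
        bulk_upload_chunks=True,
    )
    print(success, nchunks, nrows)

Leaving bulk_upload_chunks at its default of False keeps the previous per-chunk upload behaviour.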

src/snowflake/connector/pandas_tools.py

Lines changed: 19 additions & 6 deletions
@@ -260,6 +260,7 @@ def write_pandas(
     table_type: Literal["", "temp", "temporary", "transient"] = "",
     use_logical_type: bool | None = None,
     iceberg_config: dict[str, str] | None = None,
+    bulk_upload_chunks: bool = False,
     **kwargs: Any,
 ) -> tuple[
     bool,
@@ -331,6 +332,8 @@ def write_pandas(
             * base_location: the base directory that snowflake can write iceberg metadata and files to
             * catalog_sync: optionally sets the catalog integration configured for Polaris Catalog
             * storage_serialization_policy: specifies the storage serialization policy for the table
+        bulk_upload_chunks: If set to True, chunks are uploaded to the stage with a single wildcard upload.
+            This is faster, but instead of uploading and cleaning up each chunk separately, all chunks are uploaded at once and the locally stored chunks are cleaned up afterwards.

 
 
@@ -437,17 +440,27 @@ def write_pandas(
             chunk_path = os.path.join(tmp_folder, f"file{i}.txt")
             # Dump chunk into parquet file
             chunk.to_parquet(chunk_path, compression=compression, **kwargs)
-            # Upload parquet file
-            path = chunk_path.replace("\\", "\\\\").replace("'", "\\'")
+            if not bulk_upload_chunks:
+                # Upload parquet file chunk right away
+                path = chunk_path.replace("\\", "\\\\").replace("'", "\\'")
+                cursor._upload(
+                    local_file_name=f"'file://{path}'",
+                    stage_location="@" + stage_location,
+                    options={"parallel": parallel, "source_compression": "auto_detect"},
+                )
+
+                # Remove chunk file
+                os.remove(chunk_path)
+
+        if bulk_upload_chunks:
+            # Upload tmp directory with parquet chunks
+            path = tmp_folder.replace("\\", "\\\\").replace("'", "\\'")
             cursor._upload(
-                local_file_name=f"'file://{path}'",
+                local_file_name=f"'file://{path}/*'",
                 stage_location="@" + stage_location,
                 options={"parallel": parallel, "source_compression": "auto_detect"},
             )
 
-            # Remove chunk file
-            os.remove(chunk_path)
-
     # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
     # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
     if quote_identifiers:
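
To make the branching above easier to follow outside the diff, the sketch below illustrates the per-chunk versus wildcard upload strategies; upload_to_stage and write_chunks are hypothetical stand-ins for illustration only and are not part of the connector:

# Minimal sketch of per-chunk vs. bulk (wildcard) upload; upload_to_stage is a
# hypothetical placeholder that only prints the upload it would perform.
import os
import tempfile

import pandas as pd


def upload_to_stage(local_path: str) -> None:
    # Placeholder for cursor._upload(); a real implementation would PUT to a stage.
    print(f"PUT 'file://{local_path}' @mystage")


def write_chunks(df: pd.DataFrame, chunk_size: int, bulk_upload_chunks: bool) -> None:
    with tempfile.TemporaryDirectory() as tmp_folder:
        for i in range(0, len(df), chunk_size):
            chunk_path = os.path.join(tmp_folder, f"file{i}.parquet")
            df.iloc[i : i + chunk_size].to_parquet(chunk_path)
            if not bulk_upload_chunks:
                # Default behaviour: upload each chunk right away, then delete it.
                upload_to_stage(chunk_path)
                os.remove(chunk_path)
        if bulk_upload_chunks:
            # Bulk behaviour: one wildcard upload of the whole folder; the
            # TemporaryDirectory cleanup then removes all chunks at once.
            upload_to_stage(os.path.join(tmp_folder, "*"))


write_chunks(pd.DataFrame({"a": range(10)}), chunk_size=3, bulk_upload_chunks=True)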

test/integ/pandas/test_pandas_tools.py

Lines changed: 46 additions & 0 deletions
@@ -1139,3 +1139,49 @@ def test_pandas_with_single_quote(
         )
     finally:
         cnx.execute_string(f"drop table if exists {table_name}")
+
+
+@pytest.mark.parametrize("bulk_upload_chunks", [True, False])
+def test_write_pandas_bulk_chunks_upload(conn_cnx, bulk_upload_chunks):
+    """Tests that write_pandas uploads chunks correctly with and without bulk chunk upload."""
+    random_table_name = random_string(5, "userspoints_")
+    df_data = [("Dash", 50), ("Luke", 20), ("Mark", 10), ("John", 30)]
+    df = pandas.DataFrame(df_data, columns=["name", "points"])
+
+    table_name = random_table_name
+    col_id = "id"
+    col_name = "name"
+    col_points = "points"
+
+    create_sql = (
+        f"CREATE OR REPLACE TABLE {table_name}"
+        f"({col_name} STRING, {col_points} INT, {col_id} INT AUTOINCREMENT)"
+    )
+
+    select_count_sql = f"SELECT count(*) FROM {table_name}"
+    drop_sql = f"DROP TABLE IF EXISTS {table_name}"
+    with conn_cnx() as cnx:  # type: SnowflakeConnection
+        cnx.execute_string(create_sql)
+        try:
+            # Write dataframe in four 1-row chunks
+            success, nchunks, nrows, _ = write_pandas(
+                cnx,
+                df,
+                random_table_name,
+                quote_identifiers=False,
+                auto_create_table=False,
+                overwrite=True,
+                index=True,
+                on_error="continue",
+                chunk_size=1,
+                bulk_upload_chunks=bulk_upload_chunks,
+            )
+            # Check write_pandas output
+            assert success
+            assert nchunks == 4
+            assert nrows == 4
+            result = cnx.cursor(DictCursor).execute(select_count_sql).fetchone()
+            # Check number of rows
+            assert result["COUNT(*)"] == 4
+        finally:
+            cnx.execute_string(drop_sql)
