Commit 8a12c64

SNOW-669650: Fix write_pandas atomicity when overwrite=True (#1291)
1 parent 5a4e7a6 commit 8a12c64

17 files changed: +128 −132 lines changed

DESCRIPTION.md

Lines changed: 4 additions & 0 deletions
@@ -8,6 +8,10 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-connector-python
 
 # Release Notes
 
+- v2.9.0(Unreleased)
+
+  - Enhanced the atomicity of write_pandas when overwrite is set to True
+
 - v2.8.0(September 27,2022)
 
   - Fixed a bug where rowcount was deleted when the cursor was closed
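
Note: a minimal usage sketch of the behavior this release note describes (the connection parameters and table name below are placeholders, not part of the change):

# Illustrative usage sketch; credentials and object names are placeholders.
import pandas
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas

cnx = snowflake.connector.connect(
    account="<account>",
    user="<user>",
    password="<password>",
    database="TESTDB",
    schema="PUBLIC",
)
try:
    df = pandas.DataFrame([("Mark", 10), ("Luke", 20)], columns=["name", "points"])
    # With overwrite=True, the connector now loads into a randomly named table
    # first and only swaps it over the existing one after COPY INTO succeeds,
    # so a failed load no longer leaves the target dropped or truncated.
    success, nchunks, nrows, _ = write_pandas(
        cnx, df, "DEMO_TABLE", auto_create_table=True, overwrite=True
    )
finally:
    cnx.close()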

src/snowflake/connector/pandas_tools.py

Lines changed: 63 additions & 87 deletions
@@ -6,8 +6,6 @@
 
 import collections.abc
 import os
-import random
-import string
 import warnings
 from functools import partial
 from logging import getLogger
@@ -19,6 +17,7 @@
 from snowflake.connector import ProgrammingError
 from snowflake.connector.options import pandas
 from snowflake.connector.telemetry import TelemetryData, TelemetryField
+from snowflake.connector.util_text import random_string
 
 if TYPE_CHECKING:  # pragma: no cover
     from .connection import SnowflakeConnection
@@ -152,37 +151,21 @@ def write_pandas(
         )
 
     if quote_identifiers:
-        location = (
-            (('"' + database + '".') if database else "")
-            + (('"' + schema + '".') if schema else "")
-            + ('"' + table_name + '"')
+        location = (f'"{database}".' if database else "") + (
+            f'"{schema}".' if schema else ""
         )
     else:
-        location = (
-            (database + "." if database else "")
-            + (schema + "." if schema else "")
-            + (table_name)
+        location = (f"{database}." if database else "") + (
+            f"{schema}." if schema else ""
         )
     if chunk_size is None:
        chunk_size = len(df)
+
     cursor = conn.cursor()
-    stage_name = None  # Forward declaration
-    while True:
-        try:
-            stage_name = "".join(
-                random.choice(string.ascii_lowercase) for _ in range(5)
-            )
-            create_stage_sql = (
-                "create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
-                '"{stage_name}"'
-            ).format(stage_name=stage_name)
-            logger.debug(f"creating stage with '{create_stage_sql}'")
-            cursor.execute(create_stage_sql, _is_internal=True).fetchall()
-            break
-        except ProgrammingError as pe:
-            if pe.msg.endswith("already exists."):
-                continue
-            raise
+    stage_name = random_string()
+    create_stage_sql = f'CREATE TEMP STAGE /* Python:snowflake.connector.pandas_tools.write_pandas() */ "{stage_name}"'
+    logger.debug(f"creating stage with '{create_stage_sql}'")
+    cursor.execute(create_stage_sql, _is_internal=True).fetchall()
 
     with TemporaryDirectory() as tmp_folder:
         for i, chunk in chunk_helper(df, chunk_size):
@@ -202,42 +185,33 @@ def write_pandas(
             cursor.execute(upload_sql, _is_internal=True)
             # Remove chunk file
             os.remove(chunk_path)
+
+    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
+    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
     if quote_identifiers:
+        quote = '"'
         columns = '"' + '","'.join(list(df.columns)) + '"'
+        parquet_columns = "$1:" + ",$1:".join(f'"{c}"' for c in df.columns)
     else:
+        quote = ""
         columns = ",".join(list(df.columns))
+        parquet_columns = "$1:" + ",$1:".join(df.columns)
+
+    def drop_object(name: str, object_type: str) -> None:
+        drop_sql = f"DROP {object_type.upper()} IF EXISTS {name} /* Python:snowflake.connector.pandas_tools.write_pandas() */"
+        logger.debug(f"dropping {object_type} with '{drop_sql}'")
+        cursor.execute(drop_sql, _is_internal=True)
+
+    if auto_create_table or overwrite:
+        file_format_name = random_string()
+        file_format_sql = (
+            f"CREATE TEMP FILE FORMAT {file_format_name} "
+            f"/* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+            f"TYPE=PARQUET COMPRESSION={compression_map[compression]}"
+        )
+        logger.debug(f"creating file format with '{file_format_sql}'")
+        cursor.execute(file_format_sql, _is_internal=True)
 
-    if overwrite:
-        if auto_create_table:
-            drop_table_sql = f"DROP TABLE IF EXISTS {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
-            logger.debug(f"dropping table with '{drop_table_sql}'")
-            cursor.execute(drop_table_sql, _is_internal=True)
-        else:
-            truncate_table_sql = f"TRUNCATE TABLE IF EXISTS {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
-            logger.debug(f"truncating table with '{truncate_table_sql}'")
-            cursor.execute(truncate_table_sql, _is_internal=True)
-
-    if auto_create_table:
-        file_format_name = None
-        while True:
-            try:
-                file_format_name = (
-                    '"'
-                    + "".join(random.choice(string.ascii_lowercase) for _ in range(5))
-                    + '"'
-                )
-                file_format_sql = (
-                    f"CREATE FILE FORMAT {file_format_name} "
-                    f"/* Python:snowflake.connector.pandas_tools.write_pandas() */ "
-                    f"TYPE=PARQUET COMPRESSION={compression_map[compression]}"
-                )
-                logger.debug(f"creating file format with '{file_format_sql}'")
-                cursor.execute(file_format_sql, _is_internal=True)
-                break
-            except ProgrammingError as pe:
-                if pe.msg.endswith("already exists."):
-                    continue
-                raise
         infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))"
         logger.debug(f"inferring schema with '{infer_schema_sql}'")
         column_type_mapping = dict(
@@ -246,46 +220,48 @@ def write_pandas(
         # Infer schema can return the columns out of order depending on the chunking we do when uploading
         # so we have to iterate through the dataframe columns to make sure we create the table with its
         # columns in order
-        quote = '"' if quote_identifiers else ""
         create_table_columns = ", ".join(
             [f"{quote}{c}{quote} {column_type_mapping[c]}" for c in df.columns]
         )
+
+        target_table_name = (
+            f"{location}{quote}{random_string() if overwrite else table_name}{quote}"
+        )
         create_table_sql = (
-            f"CREATE {table_type.upper()} TABLE IF NOT EXISTS {location} "
+            f"CREATE {table_type.upper()} TABLE IF NOT EXISTS {target_table_name} "
             f"({create_table_columns})"
             f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
         )
         logger.debug(f"auto creating table with '{create_table_sql}'")
         cursor.execute(create_table_sql, _is_internal=True)
-        drop_file_format_sql = f"DROP FILE FORMAT IF EXISTS {file_format_name}"
-        logger.debug(f"dropping file format with '{drop_file_format_sql}'")
-        cursor.execute(drop_file_format_sql, _is_internal=True)
-
-    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
-    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
-    if quote_identifiers:
-        parquet_columns = "$1:" + ",$1:".join(f'"{c}"' for c in df.columns)
     else:
-        parquet_columns = "$1:" + ",$1:".join(df.columns)
+        target_table_name = f"{location}{quote}{table_name}{quote}"
+
+    try:
+        copy_into_sql = (
+            f"COPY INTO {target_table_name} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+            f"({columns}) "
+            f'FROM (SELECT {parquet_columns} FROM @"{stage_name}") '
+            f"FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression_map[compression]}) "
+            f"PURGE=TRUE ON_ERROR={on_error}"
+        )
+        logger.debug(f"copying into with '{copy_into_sql}'")
+        copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall()
+
+        if overwrite:
+            original_table_name = f"{location}{quote}{table_name}{quote}"
+            drop_object(original_table_name, "table")
+            rename_table_sql = f"ALTER TABLE {target_table_name} RENAME TO {original_table_name} /* Python:snowflake.connector.pandas_tools.write_pandas() */"
+            logger.debug(f"rename table with '{rename_table_sql}'")
+            cursor.execute(rename_table_sql, _is_internal=True)
+    except ProgrammingError:
+        if overwrite:
+            drop_object(target_table_name, "table")
+        raise
+    finally:
+        cursor._log_telemetry_job_data(TelemetryField.PANDAS_WRITE, TelemetryData.TRUE)
+        cursor.close()
 
-    copy_into_sql = (
-        "COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
-        "({columns}) "
-        'FROM (SELECT {parquet_columns} FROM @"{stage_name}") '
-        "FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression}) "
-        "PURGE=TRUE ON_ERROR={on_error}"
-    ).format(
-        location=location,
-        columns=columns,
-        parquet_columns=parquet_columns,
-        stage_name=stage_name,
-        compression=compression_map[compression],
-        on_error=on_error,
-    )
-    logger.debug(f"copying into with '{copy_into_sql}'")
-    copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall()
-    cursor._log_telemetry_job_data(TelemetryField.PANDAS_WRITE, TelemetryData.TRUE)
-    cursor.close()
     return (
         all(e[1] == "LOADED" for e in copy_results),
         len(copy_results),
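
The substance of the fix is in the last two hunks: when overwrite=True, data is copied into a randomly named table, and only after the COPY INTO succeeds is the original table dropped and the new one renamed into its place; if the load fails, the new table is dropped and the original is left untouched. A minimal standalone sketch of that drop-and-rename pattern, assuming an open cursor and a hypothetical load_into callable standing in for the PUT/COPY INTO sequence (names here are illustrative, not the connector's API):

# Sketch of the atomic-overwrite pattern, not the connector's actual code.
import random
import string


def _random_name(length: int = 10) -> str:
    # same idea as the new snowflake.connector.util_text.random_string() helper
    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))


def overwrite_atomically(cursor, table_name: str, columns_ddl: str, load_into) -> None:
    """Load into a random staging table, then swap it in only on success."""
    staging_name = f'"{_random_name()}"'
    cursor.execute(f"CREATE TABLE IF NOT EXISTS {staging_name} ({columns_ddl})")
    try:
        load_into(cursor, staging_name)  # e.g. issue COPY INTO the staging table
        # success: replace the original table with the freshly loaded one
        cursor.execute(f'DROP TABLE IF EXISTS "{table_name}"')
        cursor.execute(f'ALTER TABLE {staging_name} RENAME TO "{table_name}"')
    except Exception:
        # failure: discard the staging table; the original is untouched
        cursor.execute(f"DROP TABLE IF EXISTS {staging_name}")
        raise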

src/snowflake/connector/util_text.py

Lines changed: 21 additions & 0 deletions
@@ -6,8 +6,11 @@
 from __future__ import annotations
 
 import logging
+import random
 import re
+import string
 from io import StringIO
+from typing import Sequence
 
 COMMENT_PATTERN_RE = re.compile(r"^\s*\-\-")
 EMPTY_LINE_RE = re.compile(r"^\s*$")
@@ -254,3 +257,21 @@ def parse_account(account):
         parsed_account = account
 
     return parsed_account
+
+
+def random_string(
+    length: int = 10,
+    prefix: str = "",
+    suffix: str = "",
+    choices: Sequence[str] = string.ascii_lowercase,
+) -> str:
+    """Our convenience function to generate random string for object names.
+
+    Args:
+        length: How many random characters to choose from choices.
+        prefix: Prefix to add to random string generated.
+        suffix: Suffix to add to random string generated.
+        choices: A generator of things to choose from.
+    """
+    random_part = "".join([random.choice(choices) for _ in range(length)])
+    return "".join([prefix, random_part, suffix])

test/integ/pandas/test_pandas_tools.py

Lines changed: 25 additions & 1 deletion
@@ -13,9 +13,10 @@
 import pytest
 
 from snowflake.connector import DictCursor
+from snowflake.connector.errors import ProgrammingError
+from snowflake.connector.util_text import random_string
 
 from ...lazy_var import LazyVar
-from ...randomize import random_string
 
 try:
     from snowflake.connector.options import pandas
@@ -60,6 +61,8 @@ def test_write_pandas_with_overwrite(
     df2 = pandas.DataFrame(df2_data, columns=["name", "points"])
     df3_data = [(2022, "Jan", 10000), (2022, "Feb", 10220)]
     df3 = pandas.DataFrame(df3_data, columns=["year", "month", "revenue"])
+    df4_data = [("Frank", 100)]
+    df4 = pandas.DataFrame(df4_data, columns=["name%", "points"])
 
     if quote_identifiers:
         table_name = '"' + random_table_name + '"'
@@ -133,6 +136,27 @@ def test_write_pandas_with_overwrite(
                 else "YEAR" in [col.name for col in result[0].description]
             )
 
+            if not quote_identifiers:
+                original_result = (
+                    cnx.cursor(DictCursor).execute(select_count_sql).fetchone()
+                )
+                # the column name contains special char which should fail
+                with pytest.raises(ProgrammingError, match="unexpected '%'"):
+                    write_pandas(
+                        cnx,
+                        df4,
+                        random_table_name,
+                        quote_identifiers=quote_identifiers,
+                        auto_create_table=auto_create_table,
+                        overwrite=True,
+                        index=index,
+                    )
+                # the original table shouldn't have any change
+                assert (
+                    original_result
+                    == cnx.cursor(DictCursor).execute(select_count_sql).fetchone()
+                )
+
         finally:
             cnx.execute_string(drop_sql)
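
Note: with quote_identifiers=False the df4 column name name% is interpolated unquoted into the generated SQL, which Snowflake rejects with a parse error ("unexpected '%'"); the final assertion exercises the rollback path of the new overwrite logic by checking that the failed load left the original table's contents unchanged.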

test/integ/test_bindings.py

Lines changed: 1 addition & 2 deletions
@@ -20,8 +20,7 @@
 
 from snowflake.connector.converter import convert_datetime_to_epoch
 from snowflake.connector.errors import ForbiddenError, ProgrammingError
-
-from ..randomize import random_string
+from snowflake.connector.util_text import random_string
 
 tempfile.gettempdir()
 
test/integ/test_cursor.py

Lines changed: 1 addition & 2 deletions
@@ -51,8 +51,7 @@ class ResultMetadata(NamedTuple):
 )
 from snowflake.connector.sqlstate import SQLSTATE_FEATURE_NOT_SUPPORTED
 from snowflake.connector.telemetry import TelemetryField
-
-from ..randomize import random_string
+from snowflake.connector.util_text import random_string
 
 try:
     from snowflake.connector.constants import (

test/integ/test_dataintegrity.py

Lines changed: 1 addition & 2 deletions
@@ -18,8 +18,7 @@
 import pytz
 
 from snowflake.connector.dbapi import DateFromTicks, TimeFromTicks, TimestampFromTicks
-
-from ..randomize import random_string
+from snowflake.connector.util_text import random_string
 
 
 def table_exists(conn_cnx, name):

test/integ/test_dbapi.py

Lines changed: 1 addition & 2 deletions
@@ -17,8 +17,7 @@
 import snowflake.connector
 import snowflake.connector.dbapi
 from snowflake.connector import dbapi, errorcode, errors
-
-from ..randomize import random_string
+from snowflake.connector.util_text import random_string
 
 TABLE1 = "dbapi_ddl1"
 TABLE2 = "dbapi_ddl2"

test/integ/test_put_get.py

Lines changed: 1 addition & 1 deletion
@@ -17,10 +17,10 @@
 import pytest
 
 from snowflake.connector import OperationalError
+from snowflake.connector.util_text import random_string
 
 from ..generate_test_files import generate_k_lines_of_n_files
 from ..integ_helpers import put
-from ..randomize import random_string
 
 if TYPE_CHECKING:
     from snowflake.connector import SnowflakeConnection

test/integ/test_put_get_compress_enc.py

Lines changed: 2 additions & 1 deletion
@@ -11,8 +11,9 @@
 
 import pytest
 
+from snowflake.connector.util_text import random_string
+
 from ..integ_helpers import put
-from ..randomize import random_string
 
 pytestmark = pytest.mark.skipolddriver  # old test driver tests won't run this module
 