Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 120 additions & 50 deletions src/snowflake/snowpark/dataframe_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
wait,
ALL_COMPLETED,
ThreadPoolExecutor,
as_completed,
)

from dateutil import parser
Expand Down Expand Up @@ -48,7 +49,10 @@
)
from snowflake.snowpark.column import METADATA_COLUMN_TYPES, Column, _to_col_if_str
from snowflake.snowpark.dataframe import DataFrame
from snowflake.snowpark.exceptions import SnowparkSessionException
from snowflake.snowpark.exceptions import (
SnowparkSessionException,
SnowparkClientException,
)
from snowflake.snowpark.functions import sql_expr
from snowflake.snowpark.mock._connection import MockServerConnection
from snowflake.snowpark.table import Table
Expand Down Expand Up @@ -95,6 +99,8 @@
"TIMESTAMPFORMAT": "TIMESTAMP_FORMAT",
}

MAX_RETRY_TIME = 3


def _validate_stage_path(path: str) -> str:
stripped_path = path.strip("\"'")
Expand Down Expand Up @@ -1016,11 +1022,12 @@ def dbapi(
lower_bound: Optional[Union[str, int]] = None,
upper_bound: Optional[Union[str, int]] = None,
num_partitions: Optional[int] = None,
predicates: Optional[List[str]] = None,
*,
max_workers: Optional[int] = None,
query_timeout: Optional[int] = 0,
) -> DataFrame:
conn = create_connection()
conn.timeout = query_timeout
struct_schema, raw_schema = self._infer_data_source_schema(conn, table)
if column is None:
if (
Expand Down Expand Up @@ -1056,7 +1063,6 @@ def dbapi(
lower_bound,
upper_bound,
num_partitions,
predicates,
)
with tempfile.TemporaryDirectory() as tmp_dir:
# create temp table
Expand All @@ -1072,42 +1078,51 @@ def dbapi(
sql_create_temp_stage = f"create {get_temp_type_for_object(self._session._use_scoped_temp_objects, True)} stage if not exists {snowflake_stage_name}"
self._session._run_query(sql_create_temp_stage, is_ddl_on_temp_object=True)

with ProcessPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(
task_fetch_from_data_source,
with ProcessPoolExecutor(
max_workers=max_workers
) as process_executor, ThreadPoolExecutor(
max_workers=max_workers
) as thread_executor:
thread_pool_futures = []
process_pool_futures = [
process_executor.submit(
task_fetch_from_data_source_with_retry,
create_connection,
query,
raw_schema,
i,
tmp_dir,
query_timeout,
)
for i, query in enumerate(partitioned_queries)
]

completed_futures = wait(futures, return_when=ALL_COMPLETED)
files = []
for f in completed_futures.done:
if isinstance(f.result(), Exception):
raise f.result()
else:
files.append(f.result())
with ThreadPoolExecutor(max_workers=max_workers) as thread_executor:
futures = [
thread_executor.submit(
self.upload_and_copy_into_table,
f,
snowflake_stage_name,
snowflake_table_name,
"abort_statement",
)
for f in files
]

completed_futures = wait(futures, return_when=ALL_COMPLETED)
for f in completed_futures.done:
if f.result() is not None:
raise f.result()
for future in as_completed(process_pool_futures):
if isinstance(future.result(), Exception):
logger.debug(
"fetch from data source failed, canceling all running tasks"
)
process_executor.shutdown(wait=False)
thread_executor.shutdown(wait=False)
raise future.result()
else:
thread_pool_futures.append(
thread_executor.submit(
self.upload_and_copy_into_table_with_retry,
future.result(),
snowflake_stage_name,
snowflake_table_name,
"abort_statement",
)
)
completed_futures = wait(thread_pool_futures, return_when=ALL_COMPLETED)
for f in completed_futures.done:
if f.result() is not None and isinstance(f.result(), Exception):
logger.debug(
"upload and copy into table failed, canceling all running tasks"
)
process_executor.shutdown(wait=False)
thread_executor.shutdown(wait=False)
raise f.result()
return res_df

def _infer_data_source_schema(
Expand Down Expand Up @@ -1205,13 +1220,13 @@ def _to_snowpark_type(self, schema: Tuple[tuple]) -> StructType:
fields.append(field)
return StructType(fields)

def upload_and_copy_into_table(
def _upload_and_copy_into_table(
self,
local_file: str,
snowflake_stage_name: str,
snowflake_table_name: Optional[str] = None,
on_error: Optional[str] = "abort_statement",
) -> Optional[Exception]:
):
file_name = os.path.basename(local_file)
put_query = f"put file://{local_file} @{snowflake_stage_name}/ OVERWRITE=TRUE"
copy_into_table_query = f"""
Expand All @@ -1221,28 +1236,83 @@ def upload_and_copy_into_table(
PURGE=TRUE
ON_ERROR={on_error}
"""
try:
self._session.sql(put_query).collect()
self._session.sql(copy_into_table_query).collect()
return None
except Exception as e:
return e
self._session.sql(put_query).collect()
self._session.sql(copy_into_table_query).collect()

def upload_and_copy_into_table_with_retry(
    self,
    local_file: str,
    snowflake_stage_name: str,
    snowflake_table_name: str,
    on_error: Optional[str] = "abort_statement",
) -> Optional[Exception]:
    """Upload ``local_file`` to a stage and COPY it into ``snowflake_table_name``,
    retrying up to ``MAX_RETRY_TIME`` times.

    Returns ``None`` on success. On persistent failure the last error is
    wrapped in a ``SnowparkClientException`` and *returned* (not raised), so
    a caller draining executor futures can inspect every result and decide
    how to shut down instead of losing the batch to the first raise.
    """
    last_error: Optional[Exception] = None
    for attempt in range(1, MAX_RETRY_TIME + 1):
        try:
            self._upload_and_copy_into_table(
                local_file, snowflake_stage_name, snowflake_table_name, on_error
            )
            return None
        except Exception as exc:
            last_error = exc
            # lazy %-style args: repr is only computed when DEBUG is enabled
            logger.debug(
                "upload and copy into table failed with %r, retry count: %d, retrying ...",
                last_error,
                attempt,
            )
    wrapped_error = SnowparkClientException(
        message=f"failed to load data to snowflake, got {last_error!r}"
    )
    logger.debug(
        "upload and copy into table failed with %r, exceed max retry time",
        wrapped_error,
    )
    return wrapped_error


def task_fetch_from_data_source(
def _task_fetch_from_data_source(
create_connection: Callable[[], "Connection"],
query: str,
schema: tuple[tuple[str, Any, int, int, int, int, bool]],
i: int,
tmp_dir: str,
) -> Union[str, Exception]:
try:
conn = create_connection()
result = conn.cursor().execute(query).fetchall()
columns = [col[0] for col in schema]
df = pd.DataFrame.from_records(result, columns=columns)
path = os.path.join(tmp_dir, f"data_{i}.parquet")
df.to_parquet(path)
except Exception as e:
return e
query_timeout: int = 0,
) -> str:
conn = create_connection()
conn.timeout = query_timeout
result = conn.cursor().execute(query).fetchall()
columns = [col[0] for col in schema]
df = pd.DataFrame.from_records(result, columns=columns)
path = os.path.join(tmp_dir, f"data_{i}.parquet")
df.to_parquet(path)
return path


def task_fetch_from_data_source_with_retry(
    create_connection: Callable[[], "Connection"],
    query: str,
    schema: tuple[tuple[str, Any, int, int, int, int, bool]],
    i: int,
    tmp_dir: str,
    query_timeout: int = 0,
) -> Union[str, Exception]:
    """Fetch one partition to a parquet file, retrying up to ``MAX_RETRY_TIME`` times.

    Thin retry wrapper around ``_task_fetch_from_data_source``; same arguments.
    Runs inside a ProcessPoolExecutor worker. On success returns the parquet
    file path. On persistent failure the last error is wrapped in a
    ``SnowparkClientException`` and *returned* (not raised), so the parent
    process can inspect ``Future.result()`` and cancel remaining work cleanly.
    """
    last_error: Optional[Exception] = None
    for attempt in range(1, MAX_RETRY_TIME + 1):
        try:
            return _task_fetch_from_data_source(
                create_connection, query, schema, i, tmp_dir, query_timeout
            )
        except Exception as exc:
            last_error = exc
            # lazy %-style args: repr is only computed when DEBUG is enabled
            logger.debug(
                "fetch from data source failed with %r, retry count: %d, retrying ...",
                last_error,
                attempt,
            )
    wrapped_error = SnowparkClientException(
        message=f"failed to fetch from data source, got {last_error!r}"
    )
    logger.debug(
        "fetch from data source failed with %r, exceed max retry time",
        wrapped_error,
    )
    return wrapped_error
94 changes: 94 additions & 0 deletions tests/integ/test_data_source_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,18 @@
#

import decimal
import time
from _decimal import Decimal
import datetime
from unittest import mock
from unittest.mock import MagicMock
import pytest

from snowflake.snowpark.dataframe_reader import (
task_fetch_from_data_source_with_retry,
MAX_RETRY_TIME,
)

SQL_SERVER_TABLE_NAME = "RandomDataWith100Columns"


Expand Down Expand Up @@ -670,6 +677,7 @@
mock_conn.cursor.return_value = mock_cursor


# we hand-write these fakes because Mock objects cannot be sent to worker
# processes: they are not pickleable
class FakeConnection:
def cursor(self):
return self
Expand All @@ -687,10 +695,96 @@ def create_connection():
return FakeConnection()


def fake_task_fetch_from_data_source_with_retry(
    create_connection,
    query,
    schema,
    i,
    tmp_dir,
    query_timeout=0,
):
    """Pickleable stand-in for the real fetch task used by ``test_parallel``.

    Sleeps instead of querying so wall-clock time measures scheduling only.
    ``query_timeout`` must be accepted because ``dbapi`` submits it as the
    sixth argument; without it every worker would raise ``TypeError``.
    """
    time.sleep(2)


def upload_and_copy_into_table_with_retry(
    self,
    local_file,
    snowflake_stage_name,
    snowflake_table_name,
    on_error,
):
    # Stand-in for DataFrameReader.upload_and_copy_into_table_with_retry used
    # by test_parallel: sleeps instead of touching Snowflake so wall-clock
    # time measures scheduling rather than real I/O. Takes `self` because it
    # replaces an instance method on the class.
    time.sleep(2)


@pytest.mark.skipif(
    "config.getoption('local_testing_mode', default=False)",
    reason="feature not available in local testing",
)
def test_dbapi_with_temp_table(session):
    # End-to-end check: read the whole table through the DBAPI reader with a
    # worker pool and compare the ingested data against the expected rows.
    result_df = session.read.dbapi(
        create_connection, SQL_SERVER_TABLE_NAME, max_workers=4
    )
    assert result_df.collect() == rows


@pytest.mark.skipif(
    "config.getoption('local_testing_mode', default=False)",
    reason="feature not available in local testing",
)
def test_dbapi_retry(session):
    # Force the fetch task to fail on every attempt and verify it is retried
    # exactly MAX_RETRY_TIME times and the failure is returned, not raised.
    with mock.patch(
        "snowflake.snowpark.dataframe_reader._task_fetch_from_data_source",
        side_effect=Exception("Test error"),
    ) as mock_fetch:
        fetch_result = task_fetch_from_data_source_with_retry(
            create_connection=create_connection,
            query="SELECT * FROM test_table",
            schema=(("col1", int, 0, 0, 0, 0, False),),
            i=0,
            tmp_dir="/tmp",
        )
        assert mock_fetch.call_count == MAX_RETRY_TIME
        assert isinstance(fetch_result, Exception)

    # Same contract for the upload/COPY INTO step.
    with mock.patch(
        "snowflake.snowpark.dataframe_reader.DataFrameReader._upload_and_copy_into_table",
        side_effect=Exception("Test error"),
    ) as mock_upload:
        upload_result = session.read.upload_and_copy_into_table_with_retry(
            local_file="fake_file",
            snowflake_stage_name="fake_stage",
            snowflake_table_name="fake_table",
        )
        assert mock_upload.call_count == MAX_RETRY_TIME
        assert isinstance(upload_result, Exception)


@pytest.mark.skipif(
    "config.getoption('local_testing_mode', default=False)",
    reason="feature not available in local testing",
)
def test_parallel(session):
    """Verify fetch and ingest actually overlap.

    Three partitions, each costing 2s to "fetch" and 2s to "upload", would
    take ~12s serially; finishing in under 6s proves the pools run the work
    in parallel.
    """
    num_partitions = 3
    # The fetch task runs in a ProcessPoolExecutor, so it must be replaced
    # with a real (pickleable) function — a Mock object cannot cross the
    # process boundary.
    with mock.patch(
        "snowflake.snowpark.dataframe_reader.task_fetch_from_data_source_with_retry",
        new=fake_task_fetch_from_data_source_with_retry,
    ):
        # BUG FIX: the original passed `wrap=...`, which is not a patch/Mock
        # wrapping kwarg — it only set a `.wrap` attribute on the MagicMock,
        # so the sleeping fake never ran. autospec + side_effect preserves the
        # method signature (self is passed through) and executes the fake.
        with mock.patch(
            "snowflake.snowpark.dataframe_reader.DataFrameReader.upload_and_copy_into_table_with_retry",
            autospec=True,
            side_effect=upload_and_copy_into_table_with_retry,
        ) as mock_upload_and_copy:
            start = time.time()
            session.read.dbapi(
                create_connection,
                SQL_SERVER_TABLE_NAME,
                column="ID",
                upper_bound=100,
                lower_bound=0,
                num_partitions=num_partitions,
                max_workers=4,
            )
            end = time.time()
            # total time without parallelism would be ~12 seconds
            assert end - start < 6
            # the mocked upload runs once per partition
            assert mock_upload_and_copy.call_count == num_partitions
Loading