
Commit 92e7162

SNOW-2097818: use multithread as the default implementation for dbapi (#3491)
1 parent 8839590 commit 92e7162

4 files changed, +567 -107 lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -11,6 +11,8 @@
 - Added debuggability improvements to eagerly validate dataframe schema metadata. Enable it using `snowflake.snowpark.context.configure_development_features()`.
 - Added a new function `snowflake.snowpark.dataframe.map_in_pandas` that allows users map a function across a dataframe. The mapping function takes an iterator of pandas dataframes as input and provides one as output.
 - Added a ttl cache to describe queries. Repeated queries in a 15 second interval will use the cached value rather than requery Snowflake.
+- Added a parameter `fetch_with_process` to `DataFrameReader.dbapi` (PrPr) to enable multiprocessing for parallel data fetching in
+  local ingestion. By default, local ingestion uses multithreading. Multiprocessing may improve performance for CPU-bound tasks like Parquet file generation.
 
 #### Improvements
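For reference, a minimal usage sketch of the behavior described in the changelog entry above; the `sqlite3` connection factory and table name are placeholders (standing in for whatever DB-API connection factory you use), and only the `fetch_with_process` parameter comes from this commit.

```python
# Hypothetical usage sketch; create_connection and "MY_TABLE" are placeholders,
# only the fetch_with_process flag is introduced by this commit.
import multiprocessing
import sqlite3


def create_connection():
    # Stand-in for any DB-API connection factory (oracledb, pyodbc, ...).
    return sqlite3.connect("example.db")


def load(session):
    # Default path: partitions are fetched in parallel worker threads.
    df_threads = session.read.dbapi(create_connection, table="MY_TABLE")

    # Opt-in path: separate worker processes, which may help CPU-bound
    # Parquet file generation.
    df_processes = session.read.dbapi(
        create_connection, table="MY_TABLE", fetch_with_process=True
    )
    return df_threads, df_processes


if __name__ == "__main__":
    # Guarding the entry point matters when fetch_with_process=True spawns
    # processes; freeze_support() is only needed for frozen Windows executables.
    multiprocessing.freeze_support()
```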

src/snowflake/snowpark/_internal/data_source/utils.py

Lines changed: 66 additions & 20 deletions
@@ -5,12 +5,13 @@
 import queue
 import time
 import traceback
+import threading
 import multiprocessing as mp
 from concurrent.futures import ThreadPoolExecutor
 from threading import BoundedSemaphore
 from io import BytesIO
 from enum import Enum
-from typing import Any, Tuple, Optional, Callable, Dict
+from typing import Any, Tuple, Optional, Callable, Dict, Union
 import logging
 from snowflake.snowpark._internal.data_source.dbms_dialects import (
     Sqlite3Dialect,
@@ -151,7 +152,8 @@ def _task_fetch_data_from_source(
     worker: DataSourceReader,
     partition: str,
     partition_idx: int,
-    parquet_queue: mp.Queue,
+    parquet_queue: Union[mp.Queue, queue.Queue],
+    stop_event: threading.Event = None,
 ):
     """
     Fetch data from source and convert to parquet BytesIO objects.
@@ -179,6 +181,16 @@ def convert_to_parquet_bytesio(fetched_data, fetch_idx):
         logger.debug(f"Added parquet BytesIO to queue: {parquet_id}")

     for i, result in enumerate(worker.read(partition)):
+        if stop_event and stop_event.is_set():
+            parquet_queue.put(
+                (
+                    PARTITION_TASK_ERROR_SIGNAL,
+                    SnowparkDataframeReaderException(
+                        "Data fetching stopped by thread failure"
+                    ),
+                )
+            )
+            break
         convert_to_parquet_bytesio(result, i)

     parquet_queue.put((f"{PARTITION_TASK_COMPLETE_SIGNAL_PREFIX}{partition_idx}", None))
@@ -188,14 +200,16 @@ def _task_fetch_data_from_source_with_retry(
     worker: DataSourceReader,
     partition: str,
     partition_idx: int,
-    parquet_queue: mp.Queue,
+    parquet_queue: Union[mp.Queue, queue.Queue],
+    stop_event: threading.Event = None,
 ):
     _retry_run(
         _task_fetch_data_from_source,
         worker,
         partition,
         partition_idx,
         parquet_queue,
+        stop_event,
     )


@@ -292,7 +306,12 @@ def _retry_run(func: Callable, *args, **kwargs) -> Any:


 # DBAPI worker function that processes multiple partitions
-def worker_process(partition_queue: mp.Queue, parquet_queue: mp.Queue, reader):
+def worker_process(
+    partition_queue: Union[mp.Queue, queue.Queue],
+    parquet_queue: Union[mp.Queue, queue.Queue],
+    reader,
+    stop_event: threading.Event = None,
+):
     """Worker process that fetches data from multiple partitions"""
     while True:
         try:
@@ -304,6 +323,7 @@ def worker_process(partition_queue: mp.Queue, parquet_queue: mp.Queue, reader):
                 query,
                 partition_idx,
                 parquet_queue,
+                stop_event,
             )
         except queue.Empty:
             # No more work available, exit gracefully
@@ -340,14 +360,15 @@ def process_completed_futures(thread_futures):

 def process_parquet_queue_with_threads(
     session: "snowflake.snowpark.Session",
-    parquet_queue: mp.Queue,
-    processes: list,
+    parquet_queue: Union[mp.Queue, queue.Queue],
+    workers: list,
     total_partitions: int,
     snowflake_stage_name: str,
     snowflake_table_name: str,
     max_workers: int,
     statements_params: Optional[Dict[str, str]] = None,
     on_error: str = "abort_statement",
+    fetch_with_process: bool = False,
 ) -> None:
     """
     Process parquet data from a multiprocessing queue using a thread pool.
@@ -361,7 +382,7 @@
     Args:
         session: Snowflake session for database operations
        parquet_queue: Multiprocessing queue containing parquet data
-        processes: List of worker processes to monitor
+        workers: List of worker processes or thread futures to monitor
        total_partitions: Total number of partitions expected
        snowflake_stage_name: Name of the Snowflake stage for uploads
        snowflake_table_name: Name of the target Snowflake table
@@ -424,19 +445,44 @@

         except queue.Empty:
             backpressure_semaphore.release()  # Release semaphore if no data was fetched
-            # Check if any processes have failed
-            for i, process in enumerate(processes):
-                if not process.is_alive() and process.exitcode != 0:
-                    raise SnowparkDataframeReaderException(
-                        f"Partition {i} data fetching process failed with exit code {process.exitcode}"
-                    )
+            if fetch_with_process:
+                # Check if any processes have failed
+                for i, process in enumerate(workers):
+                    if not process.is_alive() and process.exitcode != 0:
+                        raise SnowparkDataframeReaderException(
+                            f"Partition {i} data fetching process failed with exit code {process.exitcode}"
+                        )
+            else:
+                # Check if any threads have failed
+                for i, future in enumerate(workers):
+                    if future.done():
+                        try:
+                            future.result()
+                        except BaseException as e:
+                            if isinstance(e, SnowparkDataframeReaderException):
+                                raise e
+                            raise SnowparkDataframeReaderException(
+                                f"Partition {i} data fetching thread failed with error: {e}"
+                            )
             time.sleep(0.1)
             continue

-    # Wait for all processes to complete
-    for idx, process in enumerate(processes):
-        process.join()
-        if process.exitcode != 0:
-            raise SnowparkDataframeReaderException(
-                f"Partition {idx} data fetching process failed with exit code {process.exitcode}"
-            )
+    if fetch_with_process:
+        # Wait for all processes to complete
+        for idx, process in enumerate(workers):
+            process.join()
+            if process.exitcode != 0:
+                raise SnowparkDataframeReaderException(
+                    f"Partition {idx} data fetching process failed with exit code {process.exitcode}"
+                )
+    else:
+        # Wait for all threads to complete
+        for idx, future in enumerate(workers):
+            try:
+                future.result()
+            except BaseException as e:
+                if isinstance(e, SnowparkDataframeReaderException):
+                    raise e
+                raise SnowparkDataframeReaderException(
+                    f"Partition {idx} data fetching thread failed with error: {e}"
+                )
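The hunks above thread a `threading.Event` through the fetch workers so the consumer side can stop producers once one of them fails. As a rough illustration (not the Snowpark code), the following standalone sketch shows the same coordination pattern under assumed names (`worker`, `run`, `COMPLETE` are made up for the example): worker threads drain a partition queue, push results into a bounded result queue, and exit early when the stop event is set, while the consumer surfaces worker failures instead of waiting forever.

```python
# Standalone sketch of the producer/consumer pattern used above; names are illustrative.
import queue
import threading
from concurrent.futures import ThreadPoolExecutor

COMPLETE = "PARTITION_COMPLETE"


def worker(partitions: queue.Queue, results: queue.Queue, stop: threading.Event):
    while not stop.is_set():
        try:
            idx, part = partitions.get_nowait()
        except queue.Empty:
            return  # no more work available, exit gracefully
        for chunk in range(3):  # stand-in for reading a partition in batches
            if stop.is_set():
                return  # cooperative early exit when another worker failed
            results.put((idx, f"{part}-chunk{chunk}"))
        results.put((COMPLETE, idx))  # completion signal for this partition


def run(num_partitions: int = 4, max_workers: int = 2) -> int:
    partitions, results = queue.Queue(), queue.Queue(maxsize=8)  # bounded for backpressure
    stop = threading.Event()
    for i in range(num_partitions):
        partitions.put((i, f"partition-{i}"))
    done = 0
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(worker, partitions, results, stop) for _ in range(max_workers)]
        while done < num_partitions:
            try:
                signal, payload = results.get(timeout=0.1)
            except queue.Empty:
                # Mirror the failure check: surface a crashed worker instead of hanging.
                for f in futures:
                    if f.done() and f.exception():
                        stop.set()
                        raise f.exception()
                continue
            if signal == COMPLETE:
                done += 1
            else:
                pass  # consume/upload the chunk here
    return done


if __name__ == "__main__":
    print(run())
```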

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 72 additions & 20 deletions
@@ -5,9 +5,11 @@
 import os
 import sys
 import time
-
+import queue
+from concurrent.futures import ThreadPoolExecutor
 from logging import getLogger
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, Callable
+import threading

 import snowflake.snowpark
 import snowflake.snowpark._internal.proto.generated.ast_pb2 as proto
@@ -1283,6 +1285,7 @@ def dbapi(
         session_init_statement: Optional[Union[str, List[str]]] = None,
         udtf_configs: Optional[dict] = None,
         fetch_merge_count: int = 1,
+        fetch_with_process: bool = False,
         _emit_ast: bool = True,
     ) -> DataFrame:
         """
@@ -1356,6 +1359,12 @@
                 before uploading it. This improves performance by reducing the number of
                 small Parquet files. Defaults to 1, meaning each `fetch_size` batch is written to its own
                 Parquet file and uploaded separately.
+            fetch_with_process: Whether to use multiprocessing for data fetching and Parquet file generation in local ingestion.
+                Defaults to `False`, which means multithreading is used to fetch data in parallel.
+                Setting this to `True` enables multiprocessing, which may improve performance for CPU-bound tasks
+                like Parquet file generation. When using multiprocessing, guard your script with
+                `if __name__ == "__main__":` and call `multiprocessing.freeze_support()` on Windows if needed.
+                This parameter has no effect in UDTF ingestion.

         Example::
             .. code-block:: python
@@ -1366,6 +1375,17 @@ def create_oracledb_connection():
                     return connection

                 df = session.read.dbapi(create_oracledb_connection, table=...)
+
+        Example::
+            .. code-block:: python
+
+                import oracledb
+                def create_oracledb_connection():
+                    connection = oracledb.connect(...)
+                    return connection
+
+                if __name__ == "__main__":
+                    df = session.read.dbapi(create_oracledb_connection, table=..., fetch_with_process=True)
         """
         if (not table and not query) or (table and query):
             raise SnowparkDataframeReaderException(
@@ -1444,59 +1464,91 @@ def create_oracledb_connection():
             statement_params=statements_params_for_telemetry, _emit_ast=False
         )

+        data_fetching_thread_pool_executor = None
+        data_fetching_thread_stop_event = None
+        workers = []
         try:
-            processes = []
-
-            # Determine the number of processes to use
-            max_workers = max_workers or mp.cpu_count()
-
+            # Determine the number of processes or threads to use
+            max_workers = max_workers or os.cpu_count()
+            queue_class = mp.Queue if fetch_with_process else queue.Queue
             # a queue of partitions to be processed, this is filled by the partitioner before starting the workers
-            partition_queue = mp.Queue()
+            partition_queue = queue_class()
             # a queue of parquet BytesIO objects to be uploaded
             # Set max size for parquet_queue to prevent overfilling when thread consumers are slower than process producers
             # process workers will block on this queue if it's full until the upload threads consume the BytesIO objects
-            parquet_queue = mp.Queue(_MAX_WORKER_SCALE * max_workers)
+            parquet_queue = queue_class(_MAX_WORKER_SCALE * max_workers)
             for partition_idx, query in enumerate(partitioned_queries):
                 partition_queue.put((partition_idx, query))

             # Start worker processes
             logger.debug(
                 f"Starting {max_workers} worker processes to fetch data from the data source."
             )
-            for _worker_id in range(max_workers):
-                process = mp.Process(
-                    target=worker_process,
-                    args=(partition_queue, parquet_queue, partitioner.reader()),
+
+            if fetch_with_process:
+                for _worker_id in range(max_workers):
+                    process = mp.Process(
+                        target=worker_process,
+                        args=(partition_queue, parquet_queue, partitioner.reader()),
+                    )
+                    process.start()
+                    workers.append(process)
+            else:
+                data_fetching_thread_pool_executor = ThreadPoolExecutor(
+                    max_workers=max_workers
                 )
-                process.start()
-                processes.append(process)
+                data_fetching_thread_stop_event = threading.Event()
+                workers = [
+                    data_fetching_thread_pool_executor.submit(
+                        worker_process,
+                        partition_queue,
+                        parquet_queue,
+                        partitioner.reader(),
+                        data_fetching_thread_stop_event,
+                    )
+                    for _worker_id in range(max_workers)
+                ]

             # Process BytesIO objects from queue and upload them using utility method
             process_parquet_queue_with_threads(
                 session=self._session,
                 parquet_queue=parquet_queue,
-                processes=processes,
+                workers=workers,
                 total_partitions=len(partitioned_queries),
                 snowflake_stage_name=snowflake_stage_name,
                 snowflake_table_name=snowflake_table_name,
                 max_workers=max_workers,
                 statements_params=statements_params_for_telemetry,
                 on_error="abort_statement",
+                fetch_with_process=fetch_with_process,
             )

         except BaseException as exc:
-            # Graceful shutdown - terminate all processes
-            for process in processes:
-                if process.is_alive():
-                    process.terminate()
-                    process.join(timeout=5)
+            if fetch_with_process:
+                # Graceful shutdown - terminate all processes
+                for process in workers:
+                    if process.is_alive():
+                        process.terminate()
+                        process.join(timeout=5)
+            else:
+                if data_fetching_thread_stop_event:
+                    data_fetching_thread_stop_event.set()
+                for future in workers:
+                    if not future.done():
+                        future.cancel()
+                        logger.debug(
+                            f"Cancelled a remaining data fetching future {future} due to error in another thread."
+                        )

             if isinstance(exc, SnowparkDataframeReaderException):
                 raise exc

             raise SnowparkDataframeReaderException(
                 f"Error occurred while ingesting data from the data source: {exc!r}"
             )
+        finally:
+            if data_fetching_thread_pool_executor:
+                data_fetching_thread_pool_executor.shutdown(wait=True)

         logger.debug("All data has been successfully loaded into the Snowflake table.")
         self._session._conn._telemetry_client.send_data_source_perf_telemetry(