Commit 6e4fae3

SNOW-1975364: use file for cross process communication (#3149)
1 parent 1c3c8fc

3 files changed: 50 additions, 34 deletions

src/snowflake/snowpark/_internal/data_source_utils.py

Lines changed: 15 additions & 1 deletion
@@ -5,8 +5,10 @@
 import datetime
 import decimal
 import logging
+import os
+import queue
 from enum import Enum
-from typing import List, Any, Tuple, Protocol, Optional
+from typing import List, Any, Tuple, Protocol, Optional, Set
 from snowflake.connector.options import pandas as pd
 
 from snowflake.snowpark._internal.utils import get_sorted_key_for_version

@@ -368,3 +370,15 @@ def output_type_handler(cursor, metadata):
         return cursor.var(oracledb.DB_TYPE_LONG, arraysize=cursor.arraysize)
     elif metadata.type_code == oracledb.DB_TYPE_BLOB:
         return cursor.var(oracledb.DB_TYPE_RAW, arraysize=cursor.arraysize)
+
+
+def add_unseen_files_to_process_queue(
+    work_dir: str, set_of_files_already_added_in_queue: Set[str], queue: queue.Queue
+):
+    """Add unseen files in the work_dir to the queue for processing."""
+    # all files in the work_dir are parquet files, no subdirectory
+    all_files = set(os.listdir(work_dir))
+    unseen = all_files - set_of_files_already_added_in_queue
+    for file in unseen:
+        queue.put(os.path.join(work_dir, file))
+        set_of_files_already_added_in_queue.add(file)
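Note: the new add_unseen_files_to_process_queue helper is the core of the change. Instead of handing parquet paths across processes through a managed queue, worker processes simply write files into a shared work directory and the parent polls it. Below is a minimal usage sketch, not part of the commit; it assumes the helper is importable as snowflake.snowpark._internal.data_source_utils.add_unseen_files_to_process_queue (matching the file path above), and the parquet file names are invented for illustration.

# Minimal sketch, not part of the commit: poll a work directory twice and confirm
# that already-seen files are not enqueued again.
import os
import queue
import tempfile

from snowflake.snowpark._internal.data_source_utils import (
    add_unseen_files_to_process_queue,
)

with tempfile.TemporaryDirectory() as work_dir:
    # simulate two fetches that each wrote a uniquely named parquet file
    for name in ("data_partition0_fetch0.parquet", "data_partition0_fetch1.parquet"):
        open(os.path.join(work_dir, name), "wb").close()

    parquet_file_queue: queue.Queue = queue.Queue()
    already_added: set = set()

    # the first poll enqueues both files; the second poll finds nothing new
    add_unseen_files_to_process_queue(work_dir, already_added, parquet_file_queue)
    add_unseen_files_to_process_queue(work_dir, already_added, parquet_file_queue)
    assert parquet_file_queue.qsize() == 2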

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 30 additions & 20 deletions
@@ -4,9 +4,8 @@
 import datetime
 import decimal
 import functools
-import multiprocessing as mp
+import queue
 import os
-import shutil
 import tempfile
 import time
 import traceback

@@ -22,7 +21,6 @@
 from dateutil import parser
 import sys
 from logging import getLogger
-import queue
 from typing import Any, Dict, List, Literal, Optional, Tuple, Union, Callable
 
 import snowflake.snowpark

@@ -62,6 +60,7 @@
     DATA_SOURCE_SQL_COMMENT,
     generate_sql_with_predicates,
     output_type_handler,
+    add_unseen_files_to_process_queue,
 )
 from snowflake.snowpark._internal.utils import (
     INFER_SCHEMA_FORMAT_TYPES,

@@ -1271,23 +1270,21 @@ def create_oracledb_connection():
         )
 
         try:
-            with mp.Manager() as process_manager, ProcessPoolExecutor(
+            with ProcessPoolExecutor(
                 max_workers=max_workers
             ) as process_executor, ThreadPoolExecutor(
                 max_workers=max_workers
             ) as thread_executor:
                 thread_pool_futures, process_pool_futures = [], []
-                parquet_file_queue = process_manager.Queue()
 
                 def ingestion_thread_cleanup_callback(parquet_file_path, _):
                     # clean the local temp file after ingestion to avoid consuming too much temp disk space
-                    shutil.rmtree(parquet_file_path, ignore_errors=True)
+                    os.remove(parquet_file_path)
 
                 logger.debug("Starting to fetch data from the data source.")
                 for partition_idx, query in enumerate(partitioned_queries):
                     process_future = process_executor.submit(
                         DataFrameReader._task_fetch_from_data_source_with_retry,
-                        parquet_file_queue,
                         create_connection,
                         query,
                         struct_schema,

@@ -1300,8 +1297,22 @@ def ingestion_thread_cleanup_callback(parquet_file_path, _):
                     )
                     process_pool_futures.append(process_future)
                 # Monitor queue while tasks are running
+
+                parquet_file_queue = (
+                    queue.Queue()
+                )  # maintain the queue of parquet files to process
+                set_of_files_already_added_in_queue = (
+                    set()
+                )  # maintain file names we have already put into queue
                 while True:
                     try:
+                        # each process and per fetch will create a parquet with a unique file name
+                        # we add unseen files to process queue
+                        add_unseen_files_to_process_queue(
+                            tmp_dir,
+                            set_of_files_already_added_in_queue,
+                            parquet_file_queue,
+                        )
                         file = parquet_file_queue.get_nowait()
                         logger.debug(f"Retrieved file from parquet queue: {file}")
                         thread_future = thread_executor.submit(

@@ -1336,8 +1347,15 @@ def ingestion_thread_cleanup_callback(parquet_file_path, _):
                             else:
                                 unfinished_process_pool_futures.append(future)
                                 all_job_done = False
-                        if all_job_done and parquet_file_queue.empty():
-                            # all jod is done and parquet file queue is empty, we finished all the fetch work
+                        if (
+                            all_job_done
+                            and parquet_file_queue.empty()
+                            and len(os.listdir(tmp_dir)) == 0
+                        ):
+                            # we finished all the fetch work based on the following 3 conditions:
+                            # 1. all jod is done
+                            # 2. parquet file queue is empty
+                            # 3. no files in the temp work dir as they are all removed in thread future callback
                             # now we just need to wait for all ingestion threads to complete
                             logger.debug(
                                 "All jobs are done, and the parquet file queue is empty. Fetching work is complete."

@@ -1537,7 +1555,6 @@ def _upload_and_copy_into_table_with_retry(
 
     @staticmethod
     def _task_fetch_from_data_source(
-        parquet_file_queue: queue.Queue,
         create_connection: Callable[[], "Connection"],
         query: str,
         schema: StructType,

@@ -1554,12 +1571,11 @@ def convert_to_parquet(fetched_data, fetch_idx):
                 logger.debug(
                     f"The DataFrame is empty, no parquet file is generated for partition {partition_idx} fetch {fetch_idx}."
                 )
-                return None
+                return
             path = os.path.join(
                 tmp_dir, f"data_partition{partition_idx}_fetch{fetch_idx}.parquet"
             )
             df.to_parquet(path)
-            return path
 
         conn = create_connection()
         # this is specified to pyodbc, need other way to manage timeout on other drivers

@@ -1573,26 +1589,21 @@ def convert_to_parquet(fetched_data, fetch_idx):
         if fetch_size == 0:
             cursor.execute(query)
             result = cursor.fetchall()
-            parquet_file_path = convert_to_parquet(result, 0)
-            if parquet_file_path:
-                parquet_file_queue.put(parquet_file_path)
+            convert_to_parquet(result, 0)
         elif fetch_size > 0:
             cursor = cursor.execute(query)
             fetch_idx = 0
             while True:
                 rows = cursor.fetchmany(fetch_size)
                 if not rows:
                     break
-                parquet_file_path = convert_to_parquet(rows, fetch_idx)
-                if parquet_file_path:
-                    parquet_file_queue.put(parquet_file_path)
+                convert_to_parquet(rows, fetch_idx)
                 fetch_idx += 1
         else:
             raise ValueError("fetch size cannot be smaller than 0")
 
     @staticmethod
     def _task_fetch_from_data_source_with_retry(
-        parquet_file_queue: queue.Queue,
         create_connection: Callable[[], "Connection"],
         query: str,
         schema: StructType,

@@ -1605,7 +1616,6 @@ def _task_fetch_from_data_source_with_retry(
     ):
         DataFrameReader._retry_run(
             DataFrameReader._task_fetch_from_data_source,
-            parquet_file_queue,
             create_connection,
             query,
             schema,

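Note: taken together, these dataframe_reader.py changes swap the multiprocessing.Manager queue for the file system itself. Each fetch process writes uniquely named parquet files into the shared temp directory, the parent polls that directory with add_unseen_files_to_process_queue into a local queue.Queue, and a thread callback removes each file once it is ingested, so "all fetch futures done, local queue drained, directory empty" signals completion. The self-contained sketch below mirrors that flow under those assumptions; fetch_partition and ingest are hypothetical stand-ins, not Snowpark code.

# Simplified sketch of the file-based cross-process handoff; fetch_partition and
# ingest are made-up placeholders, not the Snowpark implementation.
import os
import queue
import tempfile
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor


def fetch_partition(work_dir: str, partition_idx: int) -> None:
    """Worker process: write one uniquely named file per fetch into the shared dir."""
    path = os.path.join(work_dir, f"data_partition{partition_idx}_fetch0.parquet")
    with open(path, "wb") as f:
        f.write(b"fake parquet bytes")


def ingest(path: str) -> None:
    """Ingestion thread: pretend to upload the staged file somewhere."""
    time.sleep(0.1)


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as work_dir, ProcessPoolExecutor(
        max_workers=4
    ) as procs, ThreadPoolExecutor(max_workers=4) as threads:
        fetch_futures = [procs.submit(fetch_partition, work_dir, i) for i in range(4)]

        file_queue: queue.Queue = queue.Queue()
        seen: set = set()
        while True:
            # poll the shared directory instead of a cross-process queue
            for name in set(os.listdir(work_dir)) - seen:
                seen.add(name)
                file_queue.put(os.path.join(work_dir, name))
            try:
                path = file_queue.get_nowait()
            except queue.Empty:
                # done only when every fetch finished, the queue is drained,
                # and the cleanup callbacks have removed every staged file
                if all(f.done() for f in fetch_futures) and not os.listdir(work_dir):
                    break
                time.sleep(0.1)
                continue
            future = threads.submit(ingest, path)
            # delete the local file after ingestion, mirroring the callback in the diff
            future.add_done_callback(lambda _, p=path: os.remove(p))

Compared with the removed process_manager.Queue(), coordination here needs only ordinary files plus an in-process queue.Queue, so no Manager process or proxy objects cross the process boundary.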
tests/integ/test_data_source_api.py

Lines changed: 5 additions & 13 deletions
@@ -4,7 +4,6 @@
 import functools
 import math
 import os
-import queue
 import tempfile
 import time
 import datetime

@@ -152,7 +151,6 @@ def test_dbapi_retry(session):
         SnowparkDataframeReaderException, match="\\[RuntimeError\\] Test error"
     ):
         DataFrameReader._task_fetch_from_data_source_with_retry(
-            parquet_file_queue=queue.Queue(),
             create_connection=sql_server_create_connection,
             query="SELECT * FROM test_table",
             schema=StructType([StructField("col1", IntegerType(), False)]),

@@ -528,7 +526,6 @@ def test_negative_case(session):
 def test_task_fetch_from_data_source_with_fetch_size(
     fetch_size, partition_idx, expected_error
 ):
-    parquet_file_queue = queue.Queue()
     schema = infer_data_source_schema(
         sql_server_create_connection_small_data(),
         SQL_SERVER_TABLE_NAME,

@@ -544,7 +541,6 @@
     with tempfile.TemporaryDirectory() as tmp_dir:
 
         params = {
-            "parquet_file_queue": parquet_file_queue,
             "create_connection": sql_server_create_connection_small_data,
             "query": "SELECT * FROM test_table",
             "schema": schema,

@@ -562,16 +558,12 @@
                 DataFrameReader._task_fetch_from_data_source(**params)
         else:
             DataFrameReader._task_fetch_from_data_source(**params)
-
-            file_idx = 0
-            while not parquet_file_queue.empty():
-                file_path = parquet_file_queue.get()
+            files = sorted(os.listdir(tmp_dir))
+            for idx, file in enumerate(files):
                 assert (
-                    f"data_partition{partition_idx}_fetch{file_idx}.parquet"
-                    in file_path
-                )
-                file_idx += 1
-            assert file_idx == file_count
+                    f"data_partition{partition_idx}_fetch{idx}.parquet" in file
+                ), f"file: {file} does not match"
+            assert len(files) == file_count
 
 
 def test_database_detector():