Skip to content

Commit 63d182e

Browse files
sfc-gh-stakedaankit-bhatnagar167
authored and committed
SNOW-105228: Pandas fetch API did not correctly handle the case where the first chunk is empty
1 parent 22d19de commit 63d182e

File tree

5 files changed

+358
-19
lines changed

5 files changed

+358
-19
lines changed

arrow_iterator.pyx

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ from libcpp cimport bool as c_bool
1212
from libcpp.memory cimport shared_ptr
1313
from libcpp.string cimport string as c_string
1414
from libcpp.vector cimport vector
15+
from .errors import (Error, OperationalError)
16+
from .errorcode import ER_FAILED_TO_READ_ARROW_STREAM
1517

1618
logger = getLogger(__name__)
1719

@@ -140,15 +142,36 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
140142
cdef shared_ptr[CRecordBatchReader] reader
141143
cdef shared_ptr[CRecordBatch] record_batch
142144
input_stream.reset(new PyReadableFile(py_inputstream))
143-
CRecordBatchStreamReader.Open(input_stream.get(), &reader)
145+
cdef CStatus ret = CRecordBatchStreamReader.Open(input_stream.get(), &reader)
146+
if not ret.ok():
147+
Error.errorhandler_wrapper(
148+
None,
149+
None,
150+
OperationalError,
151+
{
152+
u'msg': u'Failed to open arrow stream: ' + ret.message(),
153+
u'errno': ER_FAILED_TO_READ_ARROW_STREAM
154+
})
155+
144156
while True:
145-
reader.get().ReadNext(&record_batch)
157+
ret = reader.get().ReadNext(&record_batch)
158+
if not ret.ok():
159+
Error.errorhandler_wrapper(
160+
None,
161+
None,
162+
OperationalError,
163+
{
164+
u'msg': u'Failed to read next arrow batch: ' + ret.message(),
165+
u'errno': ER_FAILED_TO_READ_ARROW_STREAM
166+
})
146167

147168
if record_batch.get() is NULL:
148169
break
149170

150171
self.batches.push_back(record_batch)
151172

173+
logger.debug("Batches read: %d", self.batches.size())
174+
152175
self.context = arrow_context
153176
self.cIterator = NULL
154177
self.unit = ''

arrow_result.pyx

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@ cdef class ArrowResult:
3636
object _arrow_context
3737
str _iter_unit
3838

39-
def __init__(self, raw_response, cursor):
39+
def __init__(self, raw_response, cursor, _chunk_downloader=None):
4040
self._reset()
4141
self._cursor = cursor
4242
self._connection = cursor.connection
43-
self._chunk_info(raw_response)
43+
self._chunk_info(raw_response, _chunk_downloader)
4444

45-
def _chunk_info(self, data):
45+
def _chunk_info(self, data, _chunk_downloader=None):
4646
self.total_row_index = -1 # last fetched number of rows
4747

4848
self._chunk_index = 0
@@ -55,6 +55,7 @@ cdef class ArrowResult:
5555
self._arrow_context = ArrowConverterContext(self._connection._session_parameters)
5656
self._current_chunk_row = PyArrowIterator(io.BytesIO(arrow_bytes), self._arrow_context)
5757
else:
58+
logger.debug("Data from first gs response is empty")
5859
self._current_chunk_row = EmptyPyArrowIterator(None, None)
5960
self._iter_unit = EMPTY_UNIT
6061

@@ -76,11 +77,12 @@ cdef class ArrowResult:
7677
header_value)
7778

7879
logger.debug(u'qrmk=%s', qrmk)
79-
self._chunk_downloader = self._connection._chunk_downloader_class(
80-
chunks, self._connection, self._cursor, qrmk, chunk_headers,
81-
query_result_format='arrow',
82-
prefetch_threads=self._connection.client_prefetch_threads,
83-
use_ijson=False)
80+
self._chunk_downloader = _chunk_downloader if _chunk_downloader \
81+
else self._connection._chunk_downloader_class(
82+
chunks, self._connection, self._cursor, qrmk, chunk_headers,
83+
query_result_format='arrow',
84+
prefetch_threads=self._connection.client_prefetch_threads,
85+
use_ijson=False)
8486

8587
def __iter__(self):
8688
return self
@@ -171,9 +173,16 @@ cdef class ArrowResult:
171173
raise RuntimeError
172174

173175
try:
174-
self._current_chunk_row.init(self._iter_unit) # AttributeError if it is iter(())
176+
self._current_chunk_row.init(self._iter_unit)
177+
logger.debug(u'Init table iterator successfully, current chunk index: %s, '
178+
u'chunk count: %s', self._chunk_index, self._chunk_count)
175179
while self._chunk_index <= self._chunk_count:
176-
table = self._current_chunk_row.__next__()
180+
stop_iteration_except = False
181+
try:
182+
table = self._current_chunk_row.__next__()
183+
except StopIteration:
184+
stop_iteration_except = True
185+
177186
if self._chunk_index < self._chunk_count: # multiple chunks
178187
logger.debug(
179188
u"chunk index: %s, chunk_count: %s",
@@ -182,7 +191,11 @@ cdef class ArrowResult:
182191
self._current_chunk_row = next_chunk.result_data
183192
self._current_chunk_row.init(self._iter_unit)
184193
self._chunk_index += 1
185-
yield table
194+
195+
if stop_iteration_except:
196+
continue
197+
else:
198+
yield table
186199
else:
187200
if self._chunk_count > 0 and \
188201
self._chunk_downloader is not None:
@@ -196,9 +209,6 @@ cdef class ArrowResult:
196209
self._chunk_downloader = None
197210
self._chunk_count = 0
198211
self._current_chunk_row = EmptyPyArrowIterator(None, None)
199-
except AttributeError:
200-
# just for handling the case of empty result
201-
return None
202212
finally:
203213
if self._cursor._first_chunk_time:
204214
logger.info("fetching data into pandas dataframe done")

cursor.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,7 @@ def check_can_use_arrow_resultset(self):
652652
}
653653
)
654654

655-
def check_can_use_panadas(self):
655+
def check_can_use_pandas(self):
656656
global pyarrow
657657

658658
if pyarrow is None:
@@ -707,7 +707,7 @@ def fetch_pandas_batches(self, **kwargs):
707707
Fetch Pandas dataframes in batches, where 'batch' refers to Snowflake Chunk
708708
@param kwargs: will be passed to pyarrow.Table.to_pandas() method
709709
"""
710-
self.check_can_use_panadas()
710+
self.check_can_use_pandas()
711711
if self._query_result_format != 'arrow': # TODO: or pandas isn't imported
712712
raise NotSupportedError
713713
for df in self._result._fetch_pandas_batches(**kwargs):
@@ -718,7 +718,7 @@ def fetch_pandas_all(self, **kwargs):
718718
Fetch a single Pandas dataframe containing all result rows
719719
@param kwargs: will be passed to pyarrow.Table.to_pandas() method
720720
"""
721-
self.check_can_use_panadas()
721+
self.check_can_use_pandas()
722722
if self._query_result_format != 'arrow':
723723
raise NotSupportedError
724724
return self._result._fetch_pandas_all(**kwargs)

errorcode.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,3 +75,4 @@
7575
ER_NO_PYARROW = 255002
7676
ER_NO_ARROW_RESULT = 255003
7777
ER_NO_PYARROW_SNOWSQL = 255004
78+
ER_FAILED_TO_READ_ARROW_STREAM = 255005

0 commit comments

Comments
 (0)