
Commit 85130be

sfc-gh-stakedaankit-bhatnagar167 authored and committed

SNOW-84977: Enable exposing pandas dataframe with arrow format

1 parent debbb3c · commit 85130be

14 files changed: +766 -46 lines changed

arrow_iterator.pyx

Lines changed: 53 additions & 17 deletions
@@ -9,39 +9,75 @@ from cpython.ref cimport PyObject
 
 logger = getLogger(__name__)
 
-cdef extern from "cpp/ArrowIterator/CArrowChunkIterator.hpp" namespace "sf":
-    cdef cppclass CArrowChunkIterator:
-        CArrowChunkIterator(PyObject* context)
+'''
+the unit of this iterator
+EMPTY_UNIT: default
+ROW_UNIT: fetch row by row if the user calls `fetchone()`
+TABLE_UNIT: fetch one arrow table if the user calls `fetch_pandas()`
+'''
+ROW_UNIT, TABLE_UNIT, EMPTY_UNIT = 'row', 'table', ''
+
 
+cdef extern from "cpp/ArrowIterator/CArrowIterator.hpp" namespace "sf":
+    cdef cppclass CArrowIterator:
         void addRecordBatch(PyObject * rb)
 
-        PyObject *nextRow();
+        PyObject* next();
 
         void reset();
 
 
-cdef class PyArrowChunkIterator:
-    cdef CArrowChunkIterator* cIterator
+cdef extern from "cpp/ArrowIterator/CArrowChunkIterator.hpp" namespace "sf":
+    cdef cppclass CArrowChunkIterator(CArrowIterator):
+        CArrowChunkIterator(PyObject* context) except +
+
+
+cdef extern from "cpp/ArrowIterator/CArrowTableIterator.hpp" namespace "sf":
+    cdef cppclass CArrowTableIterator(CArrowIterator):
+        CArrowTableIterator(PyObject* context) except +
+
+
+cdef class PyArrowIterator:
+    cdef object reader
+    cdef object context
+    cdef CArrowIterator* cIterator
+    cdef str unit
     cdef PyObject* cret
 
-    def __cinit__(PyArrowChunkIterator self, object arrow_stream_reader, object arrow_context):
-        self.cIterator = new CArrowChunkIterator(<PyObject*>arrow_context)
-        for rb in arrow_stream_reader:
-            self.cIterator.addRecordBatch(<PyObject*>rb)
-        self.cIterator.reset()
+    def __cinit__(self, object arrow_stream_reader, object arrow_context):
+        self.reader = arrow_stream_reader
+        self.context = arrow_context
+        self.cIterator = NULL
+        self.unit = ''
 
-    def __dealloc__(PyArrowChunkIterator self):
+    def __dealloc__(self):
         del self.cIterator
 
-    def __next__(PyArrowChunkIterator self):
-        cret = self.cIterator.nextRow()
-        if not cret:
-            logger.error("Internal error from CArrowChunkIterator\n")
+    def __next__(self):
+        self.cret = self.cIterator.next()
+
+        if not self.cret:
+            logger.error("Internal error from CArrowIterator\n")
             # it looks like this line can help us get into python and detect the global variable immediately
             # however, this log will not show up for unclear reason
-        ret = <object>cret
+        ret = <object>self.cret
 
         if ret is None:
             raise StopIteration
         else:
             return ret
+
+    def init(self, str iter_unit):
+        # init chunk (row) iterator or table iterator
+        if iter_unit != ROW_UNIT and iter_unit != TABLE_UNIT:
+            raise NotImplementedError
+        elif iter_unit == ROW_UNIT:
+            self.cIterator = new CArrowChunkIterator(<PyObject*>self.context)
+        elif iter_unit == TABLE_UNIT:
+            self.cIterator = new CArrowTableIterator(<PyObject*>self.context)
+        self.unit = iter_unit
+
+        # read record batches from the stream reader into the C++ iterator
+        for rb in self.reader:
+            self.cIterator.addRecordBatch(<PyObject*>rb)
+        self.cIterator.reset()
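
The rewritten iterator splits construction into two phases: __cinit__ only stores the reader and the converter context (cIterator stays NULL), while init() picks the C++ iterator class from the requested unit and only then feeds it the record batches. A minimal sketch of how a caller drives this, assuming the compiled extension module and a context object are at hand (arrow_bytes stands in for an Arrow IPC payload; only PyArrowIterator, ROW_UNIT, and TABLE_UNIT come from this diff):

    # Sketch only, not part of the commit.
    from pyarrow.ipc import open_stream
    from snowflake.connector.arrow_iterator import PyArrowIterator, ROW_UNIT

    reader = open_stream(arrow_bytes)            # arrow_bytes: Arrow IPC stream payload
    it = PyArrowIterator(reader, arrow_context)  # __cinit__: stores refs, no C++ iterator yet
    it.init(ROW_UNIT)                            # builds CArrowChunkIterator and loads batches
    row = next(it)                               # one row as a Python tuple; StopIteration at end

Passing TABLE_UNIT to init() instead selects CArrowTableIterator, whose next() returns one Arrow table per chunk; that is the mode the new fetch methods in arrow_result.pyx rely on.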

arrow_result.pyx

Lines changed: 98 additions & 3 deletions
@@ -10,7 +10,8 @@ from .telemetry import TelemetryField
 from .time_util import get_time_millis
 try:
     from pyarrow.ipc import open_stream
-    from .arrow_iterator import PyArrowChunkIterator
+    from pyarrow import concat_tables
+    from .arrow_iterator import PyArrowIterator, ROW_UNIT, TABLE_UNIT, EMPTY_UNIT
     from .arrow_context import ArrowConverterContext
 except ImportError:
     pass
@@ -32,6 +33,7 @@ cdef class ArrowResult:
     object _current_chunk_row
     object _chunk_downloader
     object _arrow_context
+    str _iter_unit
 
     def __init__(self, raw_response, cursor):
         self._reset()
@@ -51,9 +53,10 @@
             arrow_bytes = b64decode(rowset_b64)
             arrow_reader = open_stream(arrow_bytes)
             self._arrow_context = ArrowConverterContext(self._connection._session_parameters)
-            self._current_chunk_row = PyArrowChunkIterator(arrow_reader, self._arrow_context)
+            self._current_chunk_row = PyArrowIterator(arrow_reader, self._arrow_context)
         else:
-            self._current_chunk_row = iter([])
+            self._current_chunk_row = iter(())
+            self._iter_unit = EMPTY_UNIT
 
         if u'chunks' in data:
             chunks = data[u'chunks']
@@ -83,6 +86,13 @@ cdef class ArrowResult:
         return self
 
     def __next__(self):
+        if self._iter_unit == EMPTY_UNIT:
+            self._iter_unit = ROW_UNIT
+            self._current_chunk_row.init(self._iter_unit)
+        elif self._iter_unit == TABLE_UNIT:
+            logger.debug(u'The iterator has been built for fetching arrow table')
+            raise RuntimeError
+
         is_done = False
         try:
             row = None
@@ -96,6 +106,7 @@
                         self._chunk_index, self._chunk_count)
                 next_chunk = self._chunk_downloader.next_chunk()
                 self._current_chunk_row = next_chunk.result_data
+                self._current_chunk_row.init(self._iter_unit)
                 self._chunk_index += 1
                 try:
                     row = self._current_chunk_row.__next__()
@@ -146,4 +157,88 @@
         self._chunk_count = 0
         self._chunk_downloader = None
         self._arrow_context = None
+        self._iter_unit = EMPTY_UNIT
+
+    def _fetch_arrow_batches(self):
+        '''
+        Fetch Arrow Tables in batches, where a 'batch' is one Snowflake chunk;
+        thus, the batch size (the number of rows in each table) may vary
+        '''
+        if self._iter_unit == EMPTY_UNIT:
+            self._iter_unit = TABLE_UNIT
+        elif self._iter_unit == ROW_UNIT:
+            logger.debug(u'The iterator has been built for fetching rows')
+            raise RuntimeError
+
+        try:
+            self._current_chunk_row.init(self._iter_unit)  # AttributeError if it is iter(())
+            while self._chunk_index <= self._chunk_count:
+                table = self._current_chunk_row.__next__()
+                if self._chunk_index < self._chunk_count:  # multiple chunks
+                    logger.debug(
+                        u"chunk index: %s, chunk_count: %s",
+                        self._chunk_index, self._chunk_count)
+                    next_chunk = self._chunk_downloader.next_chunk()
+                    self._current_chunk_row = next_chunk.result_data
+                    self._current_chunk_row.init(self._iter_unit)
+                self._chunk_index += 1
+                yield table
+            else:
+                if self._chunk_count > 0 and \
+                        self._chunk_downloader is not None:
+                    self._chunk_downloader.terminate()
+                    self._cursor._log_telemetry_job_data(
+                        TelemetryField.TIME_DOWNLOADING_CHUNKS,
+                        self._chunk_downloader._total_millis_downloading_chunks)
+                    self._cursor._log_telemetry_job_data(
+                        TelemetryField.TIME_PARSING_CHUNKS,
+                        self._chunk_downloader._total_millis_parsing_chunks)
+                self._chunk_downloader = None
+                self._chunk_count = 0
+                self._current_chunk_row = iter(())
+        except AttributeError:
+            # just for handling the case of an empty result
+            return None
+        finally:
+            if self._cursor._first_chunk_time:
+                logger.info("fetching data into pandas dataframe done")
+                time_consume_last_result = get_time_millis() - self._cursor._first_chunk_time
+                self._cursor._log_telemetry_job_data(
+                    TelemetryField.TIME_CONSUME_LAST_RESULT,
+                    time_consume_last_result)
 
+    def _fetch_arrow_all(self):
+        '''
+        Fetch a single Arrow Table
+        '''
+        tables = list(self._fetch_arrow_batches())
+        if tables:
+            return concat_tables(tables)
+        else:
+            return None
+
+    def _fetch_pandas_batches(self):
+        '''
+        Fetch pandas dataframes in batches, where a 'batch' is one Snowflake chunk;
+        thus, the batch size (the number of rows in each dataframe) may vary
+        TODO: take a look at the pyarrow to_pandas() API, which provides some useful arguments
+        e.g. 1. use `use_threads=True` for acceleration
+             2. use `strings_to_categorical` and `categories` to encode categorical data,
+                which is really different from `string` in data science.
+                For example, some data may be marked as 0 and 1 as binary classes in a dataset,
+                which the user wishes to interpret as categorical data instead of integers.
+             3. use `zero_copy_only` to catch potential unnecessary memory copying
+        we'd better also provide these handy arguments to make data scientists happy :)
+        '''
+        for table in self._fetch_arrow_batches():
+            yield table.to_pandas()
+
+    def _fetch_pandas_all(self):
+        '''
+        Fetch a single pandas dataframe
+        '''
+        table = self._fetch_arrow_all()
+        if table:
+            return table.to_pandas()
+        else:
+            return None
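
Together these give ArrowResult four private entry points: _fetch_arrow_batches, _fetch_arrow_all, _fetch_pandas_batches, and _fetch_pandas_all. A sketch of how they might be consumed, plus the pyarrow to_pandas() keyword arguments the TODO above refers to; note that `result` here is an ArrowResult instance and any public cursor-level wrapper is an assumption, not part of this diff:

    # Sketch only: `result` is an ArrowResult instance (assumed reachable from
    # the cursor); the private methods are exactly those added above.
    df_all = result._fetch_pandas_all()         # one DataFrame, or None if the result is empty

    for df in result._fetch_pandas_batches():   # one DataFrame per Snowflake chunk
        print(df.shape)

    # The TODO in _fetch_pandas_batches points at options pyarrow already accepts:
    table = result._fetch_arrow_all()
    if table is not None:
        df = table.to_pandas(
            use_threads=True,              # convert columns in parallel
            strings_to_categorical=False,  # keep strings as object dtype
            zero_copy_only=False,          # permit copies where conversion needs them
        )

Note the unit guard: once a result has been iterated row by row, _fetch_arrow_batches raises RuntimeError rather than silently re-reading, and __next__ does the same after table-mode fetching.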

chunk_downloader.py

Lines changed: 2 additions & 2 deletions
@@ -20,7 +20,7 @@
 
 try:
     from pyarrow.ipc import open_stream
-    from .arrow_iterator import PyArrowChunkIterator
+    from .arrow_iterator import PyArrowIterator
     from .arrow_context import ArrowConverterContext
 except ImportError:
     pass
@@ -332,5 +332,5 @@ def __init__(self, meta, connection):
     def to_iterator(self, raw_data_fd, download_time):
         gzip_decoder = GzipFile(fileobj=raw_data_fd, mode='r')
         reader = open_stream(gzip_decoder)
-        it = PyArrowChunkIterator(reader, self._arrow_context)
+        it = PyArrowIterator(reader, self._arrow_context)
         return it

cpp/ArrowIterator/CArrowChunkIterator.cpp

Lines changed: 3 additions & 5 deletions
@@ -16,7 +16,6 @@
 
 namespace sf
 {
-Logger CArrowChunkIterator::logger("snowflake.connector.CArrowChunkIterator");
 
 CArrowChunkIterator::CArrowChunkIterator(PyObject* context)
 : m_latestReturnedRow(nullptr), m_context(context)
@@ -25,9 +24,8 @@ CArrowChunkIterator::CArrowChunkIterator(PyObject* context)
 
 void CArrowChunkIterator::addRecordBatch(PyObject* rb)
 {
-  std::shared_ptr<arrow::RecordBatch> cRecordBatch;
-  arrow::Status status = arrow::py::unwrap_record_batch(rb, &cRecordBatch);
-  m_cRecordBatches.push_back(cRecordBatch);
+  // may add some specific behaviors for this iterator
+  CArrowIterator::addRecordBatch(rb);
 }
 
 void CArrowChunkIterator::reset()
@@ -43,7 +41,7 @@ void CArrowChunkIterator::reset()
                            m_columnCount);
 }
 
-PyObject* CArrowChunkIterator::nextRow()
+PyObject* CArrowChunkIterator::next()
 {
   m_rowIndexInBatch++;
 

cpp/ArrowIterator/CArrowChunkIterator.hpp

Lines changed: 5 additions & 15 deletions
@@ -4,13 +4,8 @@
 #ifndef PC_ARROWCHUNKITERATOR_HPP
 #define PC_ARROWCHUNKITERATOR_HPP
 
-#include <Python.h>
-#include <vector>
-#include <arrow/python/platform.h>
-#include <arrow/api.h>
-#include <arrow/python/pyarrow.h>
+#include "CArrowIterator.hpp"
 #include "IColumnConverter.hpp"
-#include "logging.hpp"
 #include "Python/Common.hpp"
 
 namespace sf
@@ -21,7 +16,7 @@ namespace sf
 * iterator object)
 * will ask for nextRow to be returned back to Python
 */
-class CArrowChunkIterator
+class CArrowChunkIterator : public CArrowIterator
 {
 public:
   /**
@@ -38,19 +33,16 @@ class CArrowChunkIterator
   * Add Arrow RecordBatch to current chunk
   * @param rb recordbatch to be added
   */
-  void addRecordBatch(PyObject* rb);
+  void addRecordBatch(PyObject* rb) override;
 
   /**
   * @return a python tuple object which contains all data in current row
   */
-  PyObject* nextRow();
+  PyObject* next() override;
 
-  void reset();
+  void reset() override;
 
 private:
-  /** list of all record batch in current chunk */
-  std::vector<std::shared_ptr<arrow::RecordBatch>> m_cRecordBatches;
-
   /** number of columns */
   int m_columnCount;
 
@@ -80,8 +72,6 @@ class CArrowChunkIterator
   */
   void currentRowAsTuple();
 
-  static Logger logger;
-
   void initColumnConverters();
 };
 }
cpp/ArrowIterator/CArrowIterator.cpp

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2013-2019 Snowflake Computing
+ */
+
+#include "CArrowIterator.hpp"
+
+namespace sf
+{
+
+Logger CArrowIterator::logger("snowflake.connector.CArrowIterator");
+
+void CArrowIterator::addRecordBatch(PyObject* rb)
+{
+  std::shared_ptr<arrow::RecordBatch> cRecordBatch;
+  arrow::Status status = arrow::py::unwrap_record_batch(rb, &cRecordBatch);
+  m_cRecordBatches.push_back(cRecordBatch);
+}
+
+}
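
This new translation unit is the base-class implementation the earlier diffs delegate to; its header, CArrowIterator.hpp, is referenced but not shown in this commit view. The refactor moves the shared record-batch storage (m_cRecordBatches) and the logger up from CArrowChunkIterator so the new CArrowTableIterator can reuse them, while next() and reset() are presumably left pure virtual. Restated as a Python sketch for orientation (names mirror the C++ diffs; this is not generated code):

    # Sketch only: the C++ class hierarchy, expressed in Python terms.
    from abc import ABC, abstractmethod

    class ArrowIteratorBase(ABC):               # ~ sf::CArrowIterator
        def __init__(self):
            self.record_batches = []            # ~ m_cRecordBatches

        def add_record_batch(self, rb):         # ~ CArrowIterator::addRecordBatch
            self.record_batches.append(rb)

        @abstractmethod
        def next(self): ...                     # row tuple vs. whole table, per subclass

        @abstractmethod
        def reset(self): ...

    class ChunkIterator(ArrowIteratorBase):     # ~ CArrowChunkIterator: row by row
        def next(self): ...
        def reset(self): ...

    class TableIterator(ArrowIteratorBase):     # ~ CArrowTableIterator: one table per chunk
        def next(self): ...
        def reset(self): ...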
