
Commit 705341e

Authored and committed by sfc-gh-stakeda and ankit-bhatnagar167
SNOW-119348: support dictionary cursor for ARROW format result set
1 parent 7ffbe1b commit 705341e

9 files changed: +120 −45 lines
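
In short: a new `use_dict_result` flag flows from `SnowflakeCursor` through `ArrowResult` and the chunk downloader into the Cython/C++ Arrow iterators, so that `DictCursor` yields each row as a `dict` keyed by column name (instead of a tuple) when the query result format is ARROW. A quick usage sketch, adapted from the new `test_dict_cursor` below (connection parameters are placeholders):

```python
import snowflake.connector

# Placeholder credentials; see the new test in test/test_arrow_result.py
# below for the original usage this sketch is based on.
cnx = snowflake.connector.connect(account='...', user='...', password='...')
cur = cnx.cursor(snowflake.connector.DictCursor)
cur.execute("alter session set python_connector_query_result_format='ARROW'")
row = cur.execute("select 1 as foo, 2 as bar").fetchone()
assert row == {'FOO': 1, 'BAR': 2}  # dict row instead of the tuple (1, 2)
cnx.close()
```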

arrow_iterator.pyx

Lines changed: 9 additions & 7 deletions
```diff
@@ -35,6 +35,9 @@ cdef extern from "cpp/ArrowIterator/CArrowChunkIterator.hpp" namespace "sf":
     cdef cppclass CArrowChunkIterator(CArrowIterator):
         CArrowChunkIterator(PyObject* context, vector[shared_ptr[CRecordBatch]]* batches) except +
 
+    cdef cppclass DictCArrowChunkIterator(CArrowChunkIterator):
+        DictCArrowChunkIterator(PyObject* context, vector[shared_ptr[CRecordBatch]]* batches) except +
+
 
 cdef extern from "cpp/ArrowIterator/CArrowTableIterator.hpp" namespace "sf":
     cdef cppclass CArrowTableIterator(CArrowIterator):
@@ -117,11 +120,6 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
 
 
 cdef class EmptyPyArrowIterator:
-    def __cinit__(self, object arrow_stream_reader, object arrow_context):
-        pass
-
-    def __dealloc__(self):
-        pass
 
     def __next__(self):
         raise StopIteration
@@ -136,8 +134,9 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
     cdef str unit
     cdef PyObject* cret
     cdef vector[shared_ptr[CRecordBatch]] batches
+    cdef object use_dict_result
 
-    def __cinit__(self, object py_inputstream, object arrow_context):
+    def __cinit__(self, object py_inputstream, object arrow_context, object use_dict_result):
         cdef shared_ptr[InputStream] input_stream
         cdef shared_ptr[CRecordBatchReader] reader
         cdef shared_ptr[CRecordBatch] record_batch
@@ -175,6 +174,7 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
         self.context = arrow_context
         self.cIterator = NULL
         self.unit = ''
+        self.use_dict_result = use_dict_result
 
     def __dealloc__(self):
         del self.cIterator
@@ -198,7 +198,9 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
         if iter_unit != ROW_UNIT and iter_unit != TABLE_UNIT:
             raise NotImplementedError
         elif iter_unit == ROW_UNIT:
-            self.cIterator = new CArrowChunkIterator(<PyObject*>self.context, &self.batches)
+            self.cIterator = new CArrowChunkIterator(<PyObject*>self.context, &self.batches) if not self.use_dict_result \
+                else new DictCArrowChunkIterator(<PyObject*>self.context, &self.batches)
+
         elif iter_unit == TABLE_UNIT:
             self.cIterator = new CArrowTableIterator(<PyObject*>self.context, &self.batches)
         self.unit = iter_unit
```
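
Since `PyArrowIterator.__cinit__` now takes a third argument, every construction site must pass `use_dict_result` explicitly; the updated callers are in arrow_result.pyx, chunk_downloader.py, and test_unit_arrow_chunk_iterator.py below. `EmptyPyArrowIterator` drops its placeholder `__cinit__`/`__dealloc__` and is now constructed with no arguments.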

arrow_result.pyx

Lines changed: 10 additions & 6 deletions
```diff
@@ -6,6 +6,7 @@
 # cython: language_level=3
 
 from base64 import b64decode
+from libcpp cimport bool
 import io
 from logging import getLogger
 from .telemetry import TelemetryField
@@ -35,11 +36,14 @@ cdef class ArrowResult:
     object _chunk_downloader
     object _arrow_context
     str _iter_unit
+    object _use_dict_result
 
-    def __init__(self, raw_response, cursor, _chunk_downloader=None):
+
+    def __init__(self, raw_response, cursor, use_dict_result=False, _chunk_downloader=None):
         self._reset()
         self._cursor = cursor
         self._connection = cursor.connection
+        self._use_dict_result = use_dict_result
         self._chunk_info(raw_response, _chunk_downloader)
 
     def _chunk_info(self, data, _chunk_downloader=None):
@@ -53,10 +57,10 @@ cdef class ArrowResult:
         if rowset_b64:
             arrow_bytes = b64decode(rowset_b64)
             self._arrow_context = ArrowConverterContext(self._connection._session_parameters)
-            self._current_chunk_row = PyArrowIterator(io.BytesIO(arrow_bytes), self._arrow_context)
+            self._current_chunk_row = PyArrowIterator(io.BytesIO(arrow_bytes), self._arrow_context, self._use_dict_result)
         else:
             logger.debug("Data from first gs response is empty")
-            self._current_chunk_row = EmptyPyArrowIterator(None, None)
+            self._current_chunk_row = EmptyPyArrowIterator()
         self._iter_unit = EMPTY_UNIT
 
         if u'chunks' in data:
@@ -127,7 +131,7 @@ cdef class ArrowResult:
                         self._chunk_downloader._total_millis_parsing_chunks)
                     self._chunk_downloader = None
                     self._chunk_count = 0
-                    self._current_chunk_row = EmptyPyArrowIterator(None, None)
+                    self._current_chunk_row = EmptyPyArrowIterator()
                 is_done = True
 
         if is_done:
@@ -149,7 +153,7 @@ cdef class ArrowResult:
     def _reset(self):
         self.total_row_index = -1 # last fetched number of rows
         self._current_chunk_row_count = 0
-        self._current_chunk_row = EmptyPyArrowIterator(None, None)
+        self._current_chunk_row = EmptyPyArrowIterator()
        self._chunk_index = 0
 
         if hasattr(self, u'_chunk_count') and self._chunk_count > 0 and \
@@ -208,7 +212,7 @@ cdef class ArrowResult:
                         self._chunk_downloader._total_millis_parsing_chunks)
                     self._chunk_downloader = None
                     self._chunk_count = 0
-                    self._current_chunk_row = EmptyPyArrowIterator(None, None)
+                    self._current_chunk_row = EmptyPyArrowIterator()
         finally:
             if self._cursor._first_chunk_time:
                 logger.info("fetching data into pandas dataframe done")
```

chunk_downloader.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -249,7 +249,7 @@ def _fetch_chunk(self, url, headers):
         handler = JsonBinaryHandler(is_raw_binary_iterator=True,
                                     use_ijson=self._use_ijson) \
             if self._query_result_format == 'json' else \
-            ArrowBinaryHandler(self._cursor.description, self._connection)
+            ArrowBinaryHandler(self._cursor, self._connection)
 
         return self._connection.rest.fetch(
             u'get', url, headers,
@@ -316,8 +316,8 @@ def to_iterator(self, raw_data_fd, download_time):
 
 class ArrowBinaryHandler(RawBinaryDataHandler):
 
-    def __init__(self, meta, connection):
-        self._meta = meta
+    def __init__(self, cursor, connection):
+        self._cursor = cursor
         self._arrow_context = ArrowConverterContext(connection._session_parameters)
 
     """
@@ -326,5 +326,5 @@ def __init__(self, meta, connection):
     def to_iterator(self, raw_data_fd, download_time):
         from .arrow_iterator import PyArrowIterator
         gzip_decoder = GzipFile(fileobj=raw_data_fd, mode='r')
-        it = PyArrowIterator(gzip_decoder, self._arrow_context)
+        it = PyArrowIterator(gzip_decoder, self._arrow_context, self._cursor._use_dict_result)
         return it
```
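
Passing the whole cursor (rather than just `cursor.description`) to `ArrowBinaryHandler` is what lets each downloaded chunk's iterator honor the same `_use_dict_result` flag as the inline first chunk, keeping row shape consistent across chunk boundaries.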

cpp/ArrowIterator/CArrowChunkIterator.cpp

Lines changed: 24 additions & 6 deletions
```diff
@@ -37,7 +37,7 @@ PyObject* CArrowChunkIterator::next()
 
   if (m_rowIndexInBatch < m_rowCountInBatch)
   {
-    this->currentRowAsTuple();
+    this->createRowPyObject();
     if (py::checkPyError())
     {
       return nullptr;
@@ -60,7 +60,7 @@ PyObject* CArrowChunkIterator::next()
       logger.debug("Current batch index: %d, rows in current batch: %d",
                    m_currentBatchIndex, m_rowCountInBatch);
 
-      this->currentRowAsTuple();
+      this->createRowPyObject();
       if (py::checkPyError())
       {
         return nullptr;
@@ -74,7 +74,7 @@ PyObject* CArrowChunkIterator::next()
   return Py_None;
 }
 
-void CArrowChunkIterator::currentRowAsTuple()
+void CArrowChunkIterator::createRowPyObject()
 {
   m_latestReturnedRow.reset(PyTuple_New(m_columnCount));
   for (int i = 0; i < m_columnCount; i++)
@@ -91,13 +91,13 @@ void CArrowChunkIterator::initColumnConverters()
   m_currentBatchConverters.clear();
   std::shared_ptr<arrow::RecordBatch> currentBatch =
       (*m_cRecordBatches)[m_currentBatchIndex];
-  std::shared_ptr<arrow::Schema> schema = currentBatch->schema();
+  m_currentSchema = currentBatch->schema();
   for (int i = 0; i < currentBatch->num_columns(); i++)
   {
     std::shared_ptr<arrow::Array> columnArray = currentBatch->column(i);
-    std::shared_ptr<arrow::DataType> dt = schema->field(i)->type();
+    std::shared_ptr<arrow::DataType> dt = m_currentSchema->field(i)->type();
     std::shared_ptr<const arrow::KeyValueMetadata> metaData =
-        schema->field(i)->metadata();
+        m_currentSchema->field(i)->metadata();
     SnowflakeType::Type st = SnowflakeType::snowflakeTypeFromString(
         metaData->value(metaData->FindKey("logicalType")));
 
@@ -407,4 +407,22 @@ void CArrowChunkIterator::initColumnConverters()
   }
 }
 
+DictCArrowChunkIterator::DictCArrowChunkIterator(PyObject* context,
+    std::vector<std::shared_ptr<arrow::RecordBatch>> * batches)
+: CArrowChunkIterator(context, batches)
+{
+}
+
+void DictCArrowChunkIterator::createRowPyObject()
+{
+  m_latestReturnedRow.reset(PyDict_New());
+  for (int i = 0; i < m_currentSchema->num_fields(); i++)
+  {
+    PyDict_SetItemString(
+        m_latestReturnedRow.get(), m_currentSchema->field(i)->name().c_str(),
+        m_currentBatchConverters[i]->toPyObject(m_rowIndexInBatch));
+  }
+  return;
+}
+
 } // namespace sf
```
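
The base class's `next()` now calls the virtual `createRowPyObject()`, so the dict-producing subclass only overrides row construction. A minimal Python sketch of that dispatch pattern (hypothetical names; the real classes are the C++ ones above):

```python
class ChunkIterator:
    """Tuple-producing iterator, mirroring CArrowChunkIterator's shape."""

    def __init__(self, names, columns):
        self.names = names      # column names, as in m_currentSchema
        self.columns = columns  # one list of already-converted values per column
        self._row = -1          # like m_rowIndexInBatch

    def __iter__(self):
        return self

    def __next__(self):
        self._row += 1
        if self._row >= len(self.columns[0]):
            raise StopIteration
        return self._create_row(self._row)  # virtual call, like createRowPyObject()

    def _create_row(self, i):
        # base class: a tuple of all column values for row i
        return tuple(col[i] for col in self.columns)


class DictChunkIterator(ChunkIterator):
    """Dict-producing subclass, mirroring DictCArrowChunkIterator."""

    def _create_row(self, i):
        # override: a dict keyed by column name
        return {name: col[i] for name, col in zip(self.names, self.columns)}


for row in DictChunkIterator(['FOO', 'BAR'], [[1, 2], ['a', 'b']]):
    print(row)  # {'FOO': 1, 'BAR': 'a'}, then {'FOO': 2, 'BAR': 'b'}
```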

cpp/ArrowIterator/CArrowChunkIterator.hpp

Lines changed: 33 additions & 14 deletions
```diff
@@ -34,6 +34,24 @@ class CArrowChunkIterator : public CArrowIterator
    */
   PyObject* next() override;
 
+protected:
+  /**
+   * @return python object of tuple which is tuple of all row values
+   */
+  virtual void createRowPyObject();
+
+  /** pointer to the latest returned python tuple(row) result */
+  py::UniqueRef m_latestReturnedRow;
+
+  /** list of column converters*/
+  std::vector<std::shared_ptr<sf::IColumnConverter>> m_currentBatchConverters;
+
+  /** row index inside current record batch (start from 0) */
+  int m_rowIndexInBatch;
+
+  /** schema of current record batch */
+  std::shared_ptr<arrow::Schema> m_currentSchema;
+
 private:
   /** number of columns */
   int m_columnCount;
@@ -44,28 +62,29 @@ class CArrowChunkIterator : public CArrowIterator
   /** current index that iterator points to */
   int m_currentBatchIndex;
 
-  /** row index inside current record batch (start from 0) */
-  int m_rowIndexInBatch;
-
   /** total number of rows inside current record batch */
   int64_t m_rowCountInBatch;
 
-  /** pointer to the latest returned python tuple(row) result */
-  py::UniqueRef m_latestReturnedRow;
-
-  /** list of column converters*/
-  std::vector<std::shared_ptr<sf::IColumnConverter>> m_currentBatchConverters;
-
   /** arrow format convert context for the current session */
   PyObject* m_context;
 
-  /**
-   * @return python object of tuple which is tuple of all row values
-   */
-  void currentRowAsTuple();
-
   void initColumnConverters();
 };
+
+class DictCArrowChunkIterator : public CArrowChunkIterator
+{
+public:
+  DictCArrowChunkIterator(PyObject* context, std::vector<std::shared_ptr<arrow::RecordBatch>> *);
+
+  ~DictCArrowChunkIterator() = default;
+
+private:
+
+  void createRowPyObject() override;
+
+};
+
+
 }
 
 #endif // PC_ARROWCHUNKITERATOR_HPP
```
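
To support the override, the row-building hook and the state it touches (`m_latestReturnedRow`, the converter list, `m_rowIndexInBatch`, and the new `m_currentSchema`) move from `private` to `protected`. Note the doc comment on `createRowPyObject` still says "tuple", which is accurate only for the base-class implementation.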

cursor.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -82,7 +82,14 @@ class SnowflakeCursor(object):
         u(r'alter\s+session\s+set\s+(.*)=\'?([^\']+)\'?\s*;'),
         flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
 
-    def __init__(self, connection, json_result_class=JsonResult):
+    def __init__(self, connection, use_dict_result=False, json_result_class=JsonResult):
+        """
+        :param connection: connection created this cursor
+        :param use_dict_result: whether use dict result or not. This variable only applied to
+                                arrow result. When result in json, json_result_class will be
+                                honored
+        :param json_result_class: class that used in json result
+        """
         self._connection = connection
 
         self._errorhandler = Error.default_errorhandler
@@ -106,6 +113,7 @@ def __init__(self, connection, json_result_class=JsonResult):
         self._timezone = None
         self._binary_output_format = None
         self._result = None
+        self._use_dict_result = use_dict_result
         self._json_result_class = json_result_class
 
         self._arraysize = 1 # PEP-0249: defaults to 1
@@ -623,7 +631,7 @@ def _init_result_and_meta(self, data, use_ijson=False):
 
         if self._query_result_format == 'arrow':
             self.check_can_use_arrow_resultset()
-            self._result = ArrowResult(data, self)
+            self._result = ArrowResult(data, self, use_dict_result=self._use_dict_result)
         else:
             self._result = self._json_result_class(data, self, use_ijson)
 
@@ -944,4 +952,4 @@ class DictCursor(SnowflakeCursor):
     """
 
     def __init__(self, connection):
-        SnowflakeCursor.__init__(self, connection, DictJsonResult)
+        SnowflakeCursor.__init__(self, connection, use_dict_result=True, json_result_class=DictJsonResult)
```
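
With the new keyword in place, `DictCursor` gets dict rows for both result formats. In principle a cursor could also be constructed with the flag directly (a sketch against the new `__init__` signature above, with `cnx` an open connection; not necessarily an intended public entry point):

```python
from snowflake.connector.cursor import SnowflakeCursor

# Sketch only: dict rows apply to ARROW results; JSON still uses json_result_class.
cur = SnowflakeCursor(cnx, use_dict_result=True)
```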

test/test_arrow_result.py

Lines changed: 21 additions & 0 deletions
```diff
@@ -8,6 +8,7 @@
 import random
 import pytest
 from datetime import datetime
+import snowflake.connector
 try:
     from snowflake.connector.arrow_iterator import PyArrowIterator
     no_arrow_iterator_ext = False
@@ -374,6 +375,26 @@ def test_select_with_large_resultset(conn_cnx):
     iterate_over_test_chunk("large_resultset", conn_cnx, sql_text, row_count, col_count)
 
 
+def test_dict_cursor(conn_cnx):
+    with conn_cnx() as cnx:
+        with cnx.cursor(snowflake.connector.DictCursor) as c:
+            c.execute("alter session set python_connector_query_result_format='ARROW'")
+
+            # first test small result generated by GS
+            ret = c.execute("select 1 as foo, 2 as bar").fetchone()
+            assert ret['FOO'] == 1
+            assert ret['BAR'] == 2
+
+            # test larger result set
+            row_index = 1
+            for row in c.execute("select row_number() over (order by val asc) as foo, "
+                                 "row_number() over (order by val asc) as bar "
+                                 "from (select seq4() as val from table(generator(rowcount=>10000)));"):
+                assert row['FOO'] == row_index
+                assert row['BAR'] == row_index
+                row_index += 1
+
+
 def get_random_seed():
     random.seed(datetime.now())
     return random.randint(0, 10000)
```
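
The two halves of `test_dict_cursor` appear designed to exercise both paths touched by this change: the one-row query is decoded from the inline base64 rowset of the first response (arrow_result.pyx), while the 10,000-row query is large enough to also drive the downloaded-chunk path (chunk_downloader.py).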

test/test_unit_arrow_chunk_iterator.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -538,7 +538,7 @@ def iterate_over_test_chunk(pyarrow_type, column_meta, source_data_generator, ex
     # seek stream to begnning so that we can read from stream
     stream.seek(0)
     context = ArrowConverterContext()
-    it = PyArrowIterator(stream, context)
+    it = PyArrowIterator(stream, context, False)
     it.init(ROW_UNIT)
 
     count = 0
```
