Skip to content

Commit 1f47e63

Browse files
sfc-gh-mkubiksfc-gh-fpawlowski
authored andcommitted
SNOW-1989239 - prevent silent failures on nano-arrow conversion (#2227)
Co-authored-by: Adam Ling <[email protected]> (cherry picked from commit 5e96035)
1 parent 35c1e66 commit 1f47e63

File tree

8 files changed

+73
-9
lines changed

8 files changed

+73
-9
lines changed

src/snowflake/connector/connection.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,10 @@ def _get_private_bytes_from_file(
319319
False,
320320
bool,
321321
), # use https://{bucket}.storage.googleapis.com instead of https://storage.googleapis.com/{bucket}
322+
"check_arrow_conversion_error_on_every_column": (
323+
True,
324+
bool,
325+
), # SNOW-XXXXX: remove the check_arrow_conversion_error_on_every_column flag
322326
"unsafe_file_write": (
323327
False,
324328
bool,
@@ -404,6 +408,7 @@ class SnowflakeConnection:
404408
token_file_path: The file path of the token file. If both token and token_file_path are provided, the token in token_file_path will be used.
405409
unsafe_file_write: When true, files downloaded by GET will be saved with 644 permissions. Otherwise, files will be saved with safe - owner-only permissions: 600.
406410
gcs_use_virtual_endpoints: When true, the virtual endpoint url is used, see: https://cloud.google.com/storage/docs/request-endpoints#xml-api
411+
check_arrow_conversion_error_on_every_column: When true, the error check after the conversion from arrow to python types will happen for every column in the row. This is a new behaviour which fixes the bug that caused the type errors to trigger silently when occurring at any place other than last column in a row. To revert the previous (faulty) behaviour, please set this flag to false.
407412
"""
408413

409414
OCSP_ENV_LOCK = Lock()
@@ -814,6 +819,14 @@ def gcs_use_virtual_endpoints(self) -> bool:
814819
def gcs_use_virtual_endpoints(self, value: bool) -> None:
815820
self._gcs_use_virtual_endpoints = value
816821

822+
@property
823+
def check_arrow_conversion_error_on_every_column(self) -> bool:
824+
return self._check_arrow_conversion_error_on_every_column
825+
826+
@check_arrow_conversion_error_on_every_column.setter
827+
def check_arrow_conversion_error_on_every_column(self, value: bool) -> bool:
828+
self._check_arrow_conversion_error_on_every_column = value
829+
817830
def connect(self, **kwargs) -> None:
818831
"""Establishes connection to Snowflake."""
819832
logger.debug("connect")

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ namespace sf {
2727

2828
CArrowChunkIterator::CArrowChunkIterator(PyObject* context, char* arrow_bytes,
2929
int64_t arrow_bytes_size,
30-
PyObject* use_numpy)
30+
PyObject* use_numpy,
31+
PyObject* check_error_on_every_column)
3132
: CArrowIterator(arrow_bytes, arrow_bytes_size),
3233
m_latestReturnedRow(nullptr),
3334
m_context(context) {
@@ -39,6 +40,7 @@ CArrowChunkIterator::CArrowChunkIterator(PyObject* context, char* arrow_bytes,
3940
m_rowCountInBatch = 0;
4041
m_latestReturnedRow.reset();
4142
m_useNumpy = PyObject_IsTrue(use_numpy);
43+
m_checkErrorOnEveryColumn = PyObject_IsTrue(check_error_on_every_column);
4244

4345
m_batchCount = m_ipcArrowArrayVec.size();
4446
m_columnCount = m_batchCount > 0 ? m_ipcArrowSchema->n_children : 0;
@@ -92,6 +94,9 @@ void CArrowChunkIterator::createRowPyObject() {
9294
PyTuple_SET_ITEM(
9395
m_latestReturnedRow.get(), i,
9496
m_currentBatchConverters[i]->toPyObject(m_rowIndexInBatch));
97+
if (m_checkErrorOnEveryColumn && py::checkPyError()) {
98+
return;
99+
}
95100
}
96101
return;
97102
}
@@ -505,7 +510,8 @@ DictCArrowChunkIterator::DictCArrowChunkIterator(PyObject* context,
505510
char* arrow_bytes,
506511
int64_t arrow_bytes_size,
507512
PyObject* use_numpy)
508-
: CArrowChunkIterator(context, arrow_bytes, arrow_bytes_size, use_numpy) {}
513+
: CArrowChunkIterator(context, arrow_bytes, arrow_bytes_size, use_numpy,
514+
Py_False) {}
509515

510516
void DictCArrowChunkIterator::createRowPyObject() {
511517
m_latestReturnedRow.reset(PyDict_New());

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ class CArrowChunkIterator : public CArrowIterator {
3333
* Constructor
3434
*/
3535
CArrowChunkIterator(PyObject* context, char* arrow_bytes,
36-
int64_t arrow_bytes_size, PyObject* use_numpy);
36+
int64_t arrow_bytes_size, PyObject* use_numpy,
37+
PyObject* check_error_on_every_column);
3738

3839
/**
3940
* Destructor
@@ -78,6 +79,10 @@ class CArrowChunkIterator : public CArrowIterator {
7879
/** true if return numpy int64 float64 datetime*/
7980
bool m_useNumpy;
8081

82+
/** a flag that ensures running py::checkPyError after each column processing
83+
* in order to fail early on first python processing error */
84+
bool m_checkErrorOnEveryColumn;
85+
8186
void initColumnConverters();
8287
};
8388

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/nanoarrow_arrow_iterator.pyx

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ cdef extern from "CArrowChunkIterator.hpp" namespace "sf":
5050
char* arrow_bytes,
5151
int64_t arrow_bytes_size,
5252
PyObject* use_numpy,
53+
PyObject* check_error_on_every_column,
5354
) except +
5455

5556
cdef cppclass DictCArrowChunkIterator(CArrowChunkIterator):
@@ -100,6 +101,7 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
100101
# still be converted into native python types.
101102
# https://docs.snowflake.com/en/user-guide/sqlalchemy.html#numpy-data-type-support
102103
cdef object use_numpy
104+
cdef object check_error_on_every_column
103105
cdef object number_to_decimal
104106
cdef object pyarrow_table
105107

@@ -111,12 +113,14 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
111113
object use_dict_result,
112114
object numpy,
113115
object number_to_decimal,
116+
object check_error_on_every_column
114117
):
115118
self.context = arrow_context
116119
self.cIterator = NULL
117120
self.use_dict_result = use_dict_result
118121
self.cursor = cursor
119122
self.use_numpy = numpy
123+
self.check_error_on_every_column = check_error_on_every_column
120124
self.number_to_decimal = number_to_decimal
121125
self.pyarrow_table = None
122126
self.table_returned = False
@@ -139,8 +143,9 @@ cdef class PyArrowRowIterator(PyArrowIterator):
139143
object use_dict_result,
140144
object numpy,
141145
object number_to_decimal,
146+
object check_error_on_every_column,
142147
):
143-
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal)
148+
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal, check_error_on_every_column)
144149
if self.cIterator is not NULL:
145150
return
146151

@@ -155,7 +160,8 @@ cdef class PyArrowRowIterator(PyArrowIterator):
155160
<PyObject *> self.context,
156161
self.arrow_bytes,
157162
self.arrow_bytes_size,
158-
<PyObject *> self.use_numpy
163+
<PyObject *> self.use_numpy,
164+
<PyObject *> self.check_error_on_every_column
159165
)
160166
cdef ReturnVal cret = self.cIterator.checkInitializationStatus()
161167
if cret.exception:
@@ -200,8 +206,9 @@ cdef class PyArrowTableIterator(PyArrowIterator):
200206
object use_dict_result,
201207
object numpy,
202208
object number_to_decimal,
209+
object check_error_on_every_column
203210
):
204-
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal)
211+
super().__init__(cursor, py_inputstream, arrow_context, use_dict_result, numpy, number_to_decimal, check_error_on_every_column)
205212
if not INSTALLED_PYARROW:
206213
raise Error.errorhandler_make_exception(
207214
ProgrammingError,

src/snowflake/connector/result_batch.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def _create_nanoarrow_iterator(
6262
numpy: bool,
6363
number_to_decimal: bool,
6464
row_unit: IterUnit,
65+
check_error_on_every_column: bool = True,
6566
):
6667
from .nanoarrow_arrow_iterator import PyArrowRowIterator, PyArrowTableIterator
6768

@@ -74,6 +75,7 @@ def _create_nanoarrow_iterator(
7475
use_dict_result,
7576
numpy,
7677
number_to_decimal,
78+
check_error_on_every_column,
7779
)
7880
if row_unit == IterUnit.ROW_UNIT
7981
else PyArrowTableIterator(
@@ -83,6 +85,7 @@ def _create_nanoarrow_iterator(
8385
use_dict_result,
8486
numpy,
8587
number_to_decimal,
88+
check_error_on_every_column,
8689
)
8790
)
8891

@@ -614,7 +617,7 @@ def _load(
614617
)
615618

616619
def _from_data(
617-
self, data: str, iter_unit: IterUnit
620+
self, data: str, iter_unit: IterUnit, check_error_on_every_column: bool = True
618621
) -> Iterator[dict | Exception] | Iterator[tuple | Exception]:
619622
"""Creates a ``PyArrowIterator`` files from a str.
620623
@@ -631,6 +634,7 @@ def _from_data(
631634
self._numpy,
632635
self._number_to_decimal,
633636
iter_unit,
637+
check_error_on_every_column,
634638
)
635639

636640
@classmethod
@@ -665,7 +669,15 @@ def _create_iter(
665669
"""Create an iterator for the ResultBatch. Used by get_arrow_iter."""
666670
if self._local:
667671
try:
668-
return self._from_data(self._data, iter_unit)
672+
return self._from_data(
673+
self._data,
674+
iter_unit,
675+
(
676+
connection.check_arrow_conversion_error_on_every_column
677+
if connection
678+
else None
679+
),
680+
)
669681
except Exception:
670682
if connection and getattr(connection, "_debug_arrow_chunk", False):
671683
logger.debug(f"arrow data can not be parsed: {self._data}")

test/helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ def create_nanoarrow_pyarrow_iterator(input_data, use_table_iterator):
175175
False,
176176
False,
177177
False,
178+
True,
178179
)
179180
if not use_table_iterator
180181
else NanoarrowPyArrowTableIterator(
@@ -186,6 +187,7 @@ def create_nanoarrow_pyarrow_iterator(input_data, use_table_iterator):
186187
False,
187188
False,
188189
False,
190+
False,
189191
)
190192
)
191193

test/integ/pandas/test_unit_arrow_chunk_iterator.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,9 @@ def iterate_over_test_chunk(
430430
stream.seek(0)
431431
context = ArrowConverterContext()
432432

433-
it = NanoarrowPyArrowRowIterator(None, stream.read(), context, False, False, False)
433+
it = NanoarrowPyArrowRowIterator(
434+
None, stream.read(), context, False, False, False, True
435+
)
434436

435437
count = 0
436438
while True:

test/integ/test_cursor.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1790,6 +1790,23 @@ def test_out_of_range_year(conn_cnx, result_format, cursor_type, fetch_method):
17901790
fetch_next_fn()
17911791

17921792

1793+
@pytest.mark.parametrize("result_format", ("json", "arrow"))
1794+
def test_out_of_range_year_followed_by_correct_year(conn_cnx, result_format):
1795+
"""Tests whether the year 10000 is out of range exception is raised as expected."""
1796+
with conn_cnx(
1797+
session_parameters={
1798+
PARAMETER_PYTHON_CONNECTOR_QUERY_RESULT_FORMAT: result_format
1799+
}
1800+
) as con:
1801+
with con.cursor() as cur:
1802+
cur.execute("select TO_DATE('10000-01-01'), TO_DATE('9999-01-01')")
1803+
with pytest.raises(
1804+
InterfaceError,
1805+
match="out of range",
1806+
):
1807+
cur.fetchall()
1808+
1809+
17931810
@pytest.mark.skipolddriver
17941811
def test_describe(conn_cnx):
17951812
with conn_cnx() as con:

0 commit comments

Comments
 (0)