Skip to content

Commit 31608d8

Browse files
sfc-gh-nkumarsfc-gh-fpawlowski
authored andcommitted
SNOW-2052629: Add basic arrow support for Interval data types (#2296)
(cherry picked from commit bbc2f80)
1 parent 6e4debf commit 31608d8

File tree

9 files changed

+297
-1
lines changed

9 files changed

+297
-1
lines changed

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def build_extension(self, ext):
103103
"FixedSizeListConverter.cpp",
104104
"FloatConverter.cpp",
105105
"IntConverter.cpp",
106+
"IntervalConverter.cpp",
106107
"MapConverter.cpp",
107108
"ObjectConverter.cpp",
108109
"SnowflakeType.cpp",

src/snowflake/connector/arrow_context.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from .converter import _generate_tzinfo_from_tzoffset
1616

1717
if TYPE_CHECKING:
18-
from numpy import datetime64, float64, int64
18+
from numpy import datetime64, float64, int64, timedelta64
1919

2020

2121
try:
@@ -163,3 +163,41 @@ def DECFLOAT_to_decimal(self, exponent: int, significand: bytes) -> decimal.Deci
163163

164164
def DECFLOAT_to_numpy_float64(self, exponent: int, significand: bytes) -> float64:
165165
return numpy.float64(self.DECFLOAT_to_decimal(exponent, significand))
166+
167+
def INTERVAL_YEAR_MONTH_to_numpy_timedelta(self, months: int) -> timedelta64:
168+
return numpy.timedelta64(months, "M")
169+
170+
def INTERVAL_DAY_TIME_int_to_numpy_timedelta(self, nanos: int) -> timedelta64:
171+
return numpy.timedelta64(nanos, "ns")
172+
173+
def INTERVAL_DAY_TIME_int_to_timedelta(self, nanos: int) -> timedelta:
174+
# Python timedelta only supports microsecond precision. We receive value in
175+
# nanoseconds.
176+
return timedelta(microseconds=nanos // 1000)
177+
178+
def INTERVAL_DAY_TIME_decimal_to_numpy_timedelta(self, value: bytes) -> timedelta64:
179+
# Snowflake supports up to 9 digits leading field precision for the day-time
180+
# interval. That when represented in nanoseconds can not be stored in a 64-bit
181+
# integer. So we send these as Decimal128 from server to client.
182+
# Arrow uses little-endian by default.
183+
# https://arrow.apache.org/docs/format/Columnar.html#byte-order-endianness
184+
nanos = int.from_bytes(value, byteorder="little", signed=True)
185+
# Numpy timedelta only supports up to 64-bit integers, so we need to change the
186+
# unit to milliseconds to avoid overflow.
187+
# Max value received from server
188+
# = 10**9 * NANOS_PER_DAY - 1
189+
# = 86399999999999999999999 nanoseconds
190+
# = 86399999999999999 milliseconds
191+
# math.log2(86399999999999999) = 56.3 < 64
192+
return numpy.timedelta64(nanos // 1_000_000, "ms")
193+
194+
def INTERVAL_DAY_TIME_decimal_to_timedelta(self, value: bytes) -> timedelta:
195+
# Snowflake supports up to 9 digits leading field precision for the day-time
196+
# interval. That when represented in nanoseconds can not be stored in a 64-bit
197+
# integer. So we send these as Decimal128 from server to client.
198+
# Arrow uses little-endian by default.
199+
# https://arrow.apache.org/docs/format/Columnar.html#byte-order-endianness
200+
nanos = int.from_bytes(value, byteorder="little", signed=True)
201+
# Python timedelta only supports microsecond precision. We receive value in
202+
# nanoseconds.
203+
return timedelta(microseconds=nanos // 1000)

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "FixedSizeListConverter.hpp"
1414
#include "FloatConverter.hpp"
1515
#include "IntConverter.hpp"
16+
#include "IntervalConverter.hpp"
1617
#include "MapConverter.hpp"
1718
#include "ObjectConverter.hpp"
1819
#include "StringConverter.hpp"
@@ -479,6 +480,36 @@ std::shared_ptr<sf::IColumnConverter> getConverterFromSchema(
479480
break;
480481
}
481482

483+
case SnowflakeType::Type::INTERVAL_YEAR_MONTH: {
484+
converter = std::make_shared<sf::IntervalYearMonthConverter>(
485+
array, context, useNumpy);
486+
break;
487+
}
488+
489+
case SnowflakeType::Type::INTERVAL_DAY_TIME: {
490+
switch (schemaView.type) {
491+
case NANOARROW_TYPE_INT64:
492+
converter = std::make_shared<sf::IntervalDayTimeConverterInt>(
493+
array, context, useNumpy);
494+
break;
495+
case NANOARROW_TYPE_DECIMAL128:
496+
converter = std::make_shared<sf::IntervalDayTimeConverterDecimal>(
497+
array, context, useNumpy);
498+
break;
499+
default: {
500+
std::string errorInfo = Logger::formatString(
501+
"[Snowflake Exception] unknown arrow internal data type(%d) "
502+
"for OBJECT data in %s",
503+
NANOARROW_TYPE_ENUM_STRING[schemaView.type],
504+
schemaView.schema->name);
505+
logger->error(__FILE__, __func__, __LINE__, errorInfo.c_str());
506+
PyErr_SetString(PyExc_Exception, errorInfo.c_str());
507+
break;
508+
}
509+
}
510+
break;
511+
}
512+
482513
default: {
483514
std::string errorInfo = Logger::formatString(
484515
"[Snowflake Exception] unknown snowflake data type : %d", st);
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#include "IntervalConverter.hpp"
2+
3+
#include <memory>
4+
#include <string>
5+
6+
#include "Python/Common.hpp"
7+
#include "Python/Helpers.hpp"
8+
9+
namespace sf {
10+
11+
static constexpr char INTERVAL_DT_DECIMAL_TO_NUMPY_TIMEDELTA[] =
12+
"INTERVAL_DAY_TIME_decimal_to_numpy_timedelta";
13+
static constexpr char INTERVAL_DT_DECIMAL_TO_TIMEDELTA[] =
14+
"INTERVAL_DAY_TIME_decimal_to_timedelta";
15+
static constexpr char INTERVAL_DT_INT_TO_NUMPY_TIMEDELTA[] =
16+
"INTERVAL_DAY_TIME_int_to_numpy_timedelta";
17+
static constexpr char INTERVAL_DT_INT_TO_TIMEDELTA[] =
18+
"INTERVAL_DAY_TIME_int_to_timedelta";
19+
20+
IntervalYearMonthConverter::IntervalYearMonthConverter(ArrowArrayView* array,
21+
PyObject* context,
22+
bool useNumpy)
23+
: m_array(array), m_context(context), m_useNumpy(useNumpy) {}
24+
25+
PyObject* IntervalYearMonthConverter::toPyObject(int64_t rowIndex) const {
26+
if (ArrowArrayViewIsNull(m_array, rowIndex)) {
27+
Py_RETURN_NONE;
28+
}
29+
int64_t val = ArrowArrayViewGetIntUnsafe(m_array, rowIndex);
30+
if (m_useNumpy) {
31+
return PyObject_CallMethod(
32+
m_context, "INTERVAL_YEAR_MONTH_to_numpy_timedelta", "L", val);
33+
}
34+
// Python timedelta does not support year-month intervals. Use long instead.
35+
return PyLong_FromLongLong(val);
36+
}
37+
38+
IntervalDayTimeConverterInt::IntervalDayTimeConverterInt(ArrowArrayView* array,
39+
PyObject* context,
40+
bool useNumpy)
41+
: m_array(array), m_context(context) {
42+
m_method = useNumpy ? INTERVAL_DT_INT_TO_NUMPY_TIMEDELTA
43+
: INTERVAL_DT_INT_TO_TIMEDELTA;
44+
}
45+
46+
PyObject* IntervalDayTimeConverterInt::toPyObject(int64_t rowIndex) const {
47+
if (ArrowArrayViewIsNull(m_array, rowIndex)) {
48+
Py_RETURN_NONE;
49+
}
50+
int64_t val = ArrowArrayViewGetIntUnsafe(m_array, rowIndex);
51+
return PyObject_CallMethod(m_context, m_method, "L", val);
52+
}
53+
54+
IntervalDayTimeConverterDecimal::IntervalDayTimeConverterDecimal(
55+
ArrowArrayView* array, PyObject* context, bool useNumpy)
56+
: m_array(array), m_context(context) {
57+
m_method = useNumpy ? INTERVAL_DT_DECIMAL_TO_NUMPY_TIMEDELTA
58+
: INTERVAL_DT_DECIMAL_TO_TIMEDELTA;
59+
}
60+
61+
PyObject* IntervalDayTimeConverterDecimal::toPyObject(int64_t rowIndex) const {
62+
if (ArrowArrayViewIsNull(m_array, rowIndex)) {
63+
Py_RETURN_NONE;
64+
}
65+
int64_t bytes_start = 16 * (m_array->array->offset + rowIndex);
66+
const char* ptr_start = m_array->buffer_views[1].data.as_char;
67+
PyObject* int128_bytes =
68+
PyBytes_FromStringAndSize(&(ptr_start[bytes_start]), 16);
69+
return PyObject_CallMethod(m_context, m_method, "S", int128_bytes);
70+
}
71+
} // namespace sf
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#ifndef PC_INTERVALCONVERTER_HPP
2+
#define PC_INTERVALCONVERTER_HPP
3+
4+
#include <memory>
5+
6+
#include "IColumnConverter.hpp"
7+
#include "nanoarrow.h"
8+
#include "nanoarrow.hpp"
9+
10+
namespace sf {
11+
12+
class IntervalYearMonthConverter : public IColumnConverter {
13+
public:
14+
explicit IntervalYearMonthConverter(ArrowArrayView* array, PyObject* context,
15+
bool useNumpy);
16+
virtual ~IntervalYearMonthConverter() = default;
17+
18+
PyObject* toPyObject(int64_t rowIndex) const override;
19+
20+
private:
21+
ArrowArrayView* m_array;
22+
PyObject* m_context;
23+
bool m_useNumpy;
24+
};
25+
26+
class IntervalDayTimeConverterInt : public IColumnConverter {
27+
public:
28+
explicit IntervalDayTimeConverterInt(ArrowArrayView* array, PyObject* context,
29+
bool useNumpy);
30+
virtual ~IntervalDayTimeConverterInt() = default;
31+
32+
PyObject* toPyObject(int64_t rowIndex) const override;
33+
34+
private:
35+
ArrowArrayView* m_array;
36+
PyObject* m_context;
37+
const char* m_method;
38+
};
39+
40+
class IntervalDayTimeConverterDecimal : public IColumnConverter {
41+
public:
42+
explicit IntervalDayTimeConverterDecimal(ArrowArrayView* array,
43+
PyObject* context, bool useNumpy);
44+
virtual ~IntervalDayTimeConverterDecimal() = default;
45+
46+
PyObject* toPyObject(int64_t rowIndex) const override;
47+
48+
private:
49+
ArrowArrayView* m_array;
50+
PyObject* m_context;
51+
const char* m_method;
52+
};
53+
54+
} // namespace sf
55+
56+
#endif // PC_INTERVALCONVERTER_HPP

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/SnowflakeType.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ std::unordered_map<std::string, SnowflakeType::Type>
1515
{"FIXED", SnowflakeType::Type::FIXED},
1616
{"DECFLOAT", SnowflakeType::Type::DECFLOAT},
1717
{"FLOAT", SnowflakeType::Type::REAL},
18+
{"INTERVAL_YEAR_MONTH", SnowflakeType::Type::INTERVAL_YEAR_MONTH},
19+
{"INTERVAL_DAY_TIME", SnowflakeType::Type::INTERVAL_DAY_TIME},
1820
{"MAP", SnowflakeType::Type::MAP},
1921
{"OBJECT", SnowflakeType::Type::OBJECT},
2022
{"REAL", SnowflakeType::Type::REAL},

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/SnowflakeType.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class SnowflakeType {
3030
VECTOR = 16,
3131
MAP = 17,
3232
DECFLOAT = 18,
33+
INTERVAL_YEAR_MONTH = 19,
34+
INTERVAL_DAY_TIME = 20,
3335
};
3436

3537
static SnowflakeType::Type snowflakeTypeFromString(std::string str) {

test/integ/test_arrow_result.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,65 @@ def test_fetch_as_numpy_val(conn_cnx):
12231223
assert val[3] == numpy.datetime64("2019-01-02 12:34:56.12345678")
12241224

12251225

1226+
@pytest.mark.parametrize("use_numpy", [True, False])
1227+
def test_select_year_month_interval_arrow(conn_cnx, use_numpy):
1228+
cases = ["0-0", "1-2", "-1-3", "999999999-11", "-999999999-11"]
1229+
expected = [0, 14, -15, 11_999_999_999, -11_999_999_999]
1230+
if use_numpy:
1231+
expected = [numpy.timedelta64(e, "M") for e in expected]
1232+
1233+
table = "test_arrow_day_time_interval"
1234+
values = "(" + "),(".join([f"'{c}'" for c in cases]) + ")"
1235+
with conn_cnx(numpy=use_numpy) as conn:
1236+
cursor = conn.cursor()
1237+
cursor.execute("alter session set python_connector_query_result_format='arrow'")
1238+
1239+
cursor.execute("alter session set feature_interval_types=enabled")
1240+
cursor.execute(f"create or replace table {table} (c1 interval year to month)")
1241+
cursor.execute(f"insert into {table} values {values}")
1242+
result = conn.cursor().execute(f"select * from {table}").fetchall()
1243+
result = [r[0] for r in result]
1244+
assert result == expected
1245+
1246+
1247+
@pytest.mark.skip(
1248+
reason="SNOW-1878635: Add support for day-time interval in ArrowStreamWriter"
1249+
)
1250+
@pytest.mark.parametrize("use_numpy", [True, False])
1251+
def test_select_day_time_interval_arrow(conn_cnx, use_numpy):
1252+
cases = [
1253+
"0 0:0:0.0",
1254+
"12 3:4:5.678",
1255+
"-1 2:3:4.567",
1256+
"99999 23:59:59.999999",
1257+
"-99999 23:59:59.999999",
1258+
]
1259+
expected = [
1260+
timedelta(days=0),
1261+
timedelta(days=12, hours=3, minutes=4, seconds=5.678),
1262+
-timedelta(days=1, hours=2, minutes=3, seconds=4.567),
1263+
timedelta(days=99999, hours=23, minutes=59, seconds=59.999999),
1264+
-timedelta(days=99999, hours=23, minutes=59, seconds=59.999999),
1265+
]
1266+
if use_numpy:
1267+
expected = [numpy.timedelta64(e) for e in expected]
1268+
1269+
table = "test_arrow_day_time_interval"
1270+
values = "(" + "),(".join([f"'{c}'" for c in cases]) + ")"
1271+
with conn_cnx(numpy=use_numpy) as conn:
1272+
cursor = conn.cursor()
1273+
cursor.execute("alter session set python_connector_query_result_format='arrow'")
1274+
1275+
cursor.execute("alter session set feature_interval_types=enabled")
1276+
cursor.execute(
1277+
f"create or replace table {table} (c1 interval day(5) to second)"
1278+
)
1279+
cursor.execute(f"insert into {table} values {values}")
1280+
result = conn.cursor().execute(f"select * from {table}").fetchall()
1281+
result = [r[0] for r in result]
1282+
assert result == expected
1283+
1284+
12261285
def get_random_seed():
12271286
random.seed(datetime.now().timestamp())
12281287
return random.randint(0, 10000)

test/unit/test_converter.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#!/usr/bin/env python
22
from __future__ import annotations
33

4+
from datetime import timedelta
45
from decimal import Decimal
56
from logging import getLogger
67

8+
import numpy
79
import pytest
810

911
from snowflake.connector import ProgrammingError
@@ -97,3 +99,37 @@ def test_converter_to_snowflake_bindings_error():
9799
match=r"Binding data in type \(somethingsomething\) is not supported",
98100
):
99101
converter._somethingsomething_to_snowflake_bindings("Bogus")
102+
103+
104+
NANOS_PER_DAY = 24 * 60 * 60 * 10**9
105+
106+
107+
@pytest.mark.parametrize("nanos", [0, 1, 999, 1000, 999999, 10**5 * NANOS_PER_DAY - 1])
108+
def test_day_time_interval_int_to_timedelta(nanos):
109+
converter = ArrowConverterContext()
110+
assert converter.INTERVAL_DAY_TIME_int_to_timedelta(nanos) == timedelta(
111+
microseconds=nanos // 1000
112+
)
113+
assert converter.INTERVAL_DAY_TIME_int_to_numpy_timedelta(
114+
nanos
115+
) == numpy.timedelta64(nanos, "ns")
116+
117+
118+
@pytest.mark.parametrize("nanos", [0, 1, 999, 1000, 999999, 10**9 * NANOS_PER_DAY - 1])
119+
def test_day_time_interval_decimal_to_timedelta(nanos):
120+
converter = ArrowConverterContext()
121+
nano_bytes = nanos.to_bytes(16, byteorder="little", signed=True)
122+
assert converter.INTERVAL_DAY_TIME_decimal_to_timedelta(nano_bytes) == timedelta(
123+
microseconds=nanos // 1000
124+
)
125+
assert converter.INTERVAL_DAY_TIME_decimal_to_numpy_timedelta(
126+
nano_bytes
127+
) == numpy.timedelta64(nanos // 1_000_000, "ms")
128+
129+
130+
@pytest.mark.parametrize("months", [0, 1, 999, 1000, 999999, 10**9 * 12 - 1])
131+
def test_year_month_interval_to_timedelta(months):
132+
converter = ArrowConverterContext()
133+
assert converter.INTERVAL_YEAR_MONTH_to_numpy_timedelta(
134+
months
135+
) == numpy.timedelta64(months, "M")

0 commit comments

Comments
 (0)