Skip to content

Commit 6a2a5b6

Browse files
authored
SNOW-1000283: Add support for Structured Types. (#1853)
1 parent 3cced62 commit 6a2a5b6

File tree

13 files changed

+513
-6
lines changed

13 files changed

+513
-6
lines changed

DESCRIPTION.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
1717
- Fixed PyArrow Table type hinting
1818
- Added support for connecting using an existing connection via the session and master token.
1919
- Added support for connecting to Snowflake by authenticating with multiple SAML IDP using external browser.
20+
- Added support for structured types (OBJECT, MAP, ARRAY) to nanoarrow converters.
2021
- Fixed compilation issue due to missing cstdint header on gcc13.
2122
- Improved config permissions warning message.
2223

setup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def build_extension(self, ext):
9494
*((file,) if isinstance(file, str) else file)
9595
)
9696
for file in {
97+
"ArrayConverter.cpp",
9798
"BinaryConverter.cpp",
9899
"BooleanConverter.cpp",
99100
"CArrowChunkIterator.cpp",
@@ -104,6 +105,8 @@ def build_extension(self, ext):
104105
"FixedSizeListConverter.cpp",
105106
"FloatConverter.cpp",
106107
"IntConverter.cpp",
108+
"MapConverter.cpp",
109+
"ObjectConverter.cpp",
107110
"SnowflakeType.cpp",
108111
"StringConverter.cpp",
109112
"TimeConverter.cpp",
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
//
2+
// Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
3+
//
4+
5+
#include "ArrayConverter.hpp"
6+
7+
#include <memory>
8+
9+
#include "CArrowChunkIterator.hpp"
10+
#include "CArrowIterator.hpp"
11+
#include "SnowflakeType.hpp"
12+
13+
namespace sf {
14+
Logger* ArrayConverter::logger =
15+
new Logger("snowflake.connector.ArrayConverter");
16+
17+
void ArrayConverter::generateError(const std::string& msg) const {
18+
logger->error(__FILE__, __func__, __LINE__, msg.c_str());
19+
PyErr_SetString(PyExc_Exception, msg.c_str());
20+
}
21+
22+
ArrayConverter::ArrayConverter(ArrowSchemaView* schemaView,
23+
ArrowArrayView* array, PyObject* context,
24+
bool useNumpy) {
25+
m_array = array;
26+
27+
if (schemaView->schema->n_children != 1) {
28+
std::string errorInfo = Logger::formatString(
29+
"[Snowflake Exception] invalid arrow schema for array items expected 1 "
30+
"schema child, but got %d",
31+
schemaView->schema->n_children);
32+
this->generateError(errorInfo);
33+
return;
34+
}
35+
36+
ArrowSchema* item_schema = schemaView->schema->children[0];
37+
ArrowArrayView* item_array = array->children[0];
38+
m_item_converter = getConverterFromSchema(item_schema, item_array, context,
39+
useNumpy, logger);
40+
}
41+
42+
PyObject* ArrayConverter::toPyObject(int64_t rowIndex) const {
43+
if (ArrowArrayViewIsNull(m_array, rowIndex)) {
44+
Py_RETURN_NONE;
45+
}
46+
47+
// Array item offsets are stored in the second array buffers
48+
// Infer start an end of this rows slice by looking at the
49+
// current and next offset. If there isn't another offset use
50+
// the end of the array instead.
51+
int start = m_array->buffer_views[1].data.as_int32[rowIndex];
52+
int end = m_array->children[0]->length;
53+
if (rowIndex + 1 < m_array->length) {
54+
end = m_array->buffer_views[1].data.as_int32[rowIndex + 1];
55+
}
56+
57+
PyObject* list = PyList_New(end - start);
58+
for (int i = start; i < end; i++) {
59+
PyList_SetItem(list, i - start, m_item_converter->toPyObject(i));
60+
}
61+
return list;
62+
}
63+
64+
} // namespace sf
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
//
2+
// Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
3+
//
4+
5+
#ifndef PC_ARRAYCONVERTER_HPP
6+
#define PC_ARRAYCONVERTER_HPP
7+
8+
#include <memory>
9+
10+
#include "IColumnConverter.hpp"
11+
#include "logging.hpp"
12+
#include "nanoarrow.h"
13+
#include "nanoarrow.hpp"
14+
15+
namespace sf {
16+
17+
class ArrayConverter : public IColumnConverter {
18+
public:
19+
explicit ArrayConverter(ArrowSchemaView* schemaView, ArrowArrayView* array,
20+
PyObject* context, bool useNumpy);
21+
22+
PyObject* toPyObject(int64_t rowIndex) const override;
23+
24+
private:
25+
void generateError(const std::string& msg) const;
26+
27+
ArrowArrayView* m_array;
28+
std::shared_ptr<sf::IColumnConverter> m_item_converter;
29+
static Logger* logger;
30+
};
31+
32+
} // namespace sf
33+
#endif // PC_ARRAYCONVERTER_HPP

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.cpp

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,16 @@
88
#include <string>
99
#include <vector>
1010

11+
#include "ArrayConverter.hpp"
1112
#include "BinaryConverter.hpp"
1213
#include "BooleanConverter.hpp"
1314
#include "DateConverter.hpp"
1415
#include "DecimalConverter.hpp"
1516
#include "FixedSizeListConverter.hpp"
1617
#include "FloatConverter.hpp"
1718
#include "IntConverter.hpp"
19+
#include "MapConverter.hpp"
20+
#include "ObjectConverter.hpp"
1821
#include "StringConverter.hpp"
1922
#include "TimeConverter.hpp"
2023
#include "TimeStampConverter.hpp"
@@ -193,9 +196,7 @@ std::shared_ptr<sf::IColumnConverter> getConverterFromSchema(
193196
}
194197

195198
case SnowflakeType::Type::ANY:
196-
case SnowflakeType::Type::ARRAY:
197199
case SnowflakeType::Type::CHAR:
198-
case SnowflakeType::Type::OBJECT:
199200
case SnowflakeType::Type::TEXT:
200201
case SnowflakeType::Type::VARIANT: {
201202
converter = std::make_shared<sf::StringConverter>(array);
@@ -378,8 +379,13 @@ std::shared_ptr<sf::IColumnConverter> getConverterFromSchema(
378379
returnCode);
379380
scale =
380381
std::stoi(std::string(scaleString.data, scaleString.size_bytes));
381-
byteLength = std::stoi(
382-
std::string(byteLengthString.data, byteLengthString.size_bytes));
382+
383+
// Byte Length may be unset if TIMESTAMP_TZ is the child of a structured
384+
// type In this case rely on the default value.
385+
if (byteLengthString.data != nullptr) {
386+
byteLength = std::stoi(
387+
std::string(byteLengthString.data, byteLengthString.size_bytes));
388+
}
383389
}
384390
switch (byteLength) {
385391
case 8: {
@@ -408,6 +414,58 @@ std::shared_ptr<sf::IColumnConverter> getConverterFromSchema(
408414
break;
409415
}
410416

417+
case SnowflakeType::Type::ARRAY: {
418+
switch (schemaView.type) {
419+
case NANOARROW_TYPE_STRING:
420+
converter = std::make_shared<sf::StringConverter>(array);
421+
break;
422+
case NANOARROW_TYPE_LIST:
423+
converter = std::make_shared<sf::ArrayConverter>(&schemaView, array,
424+
context, useNumpy);
425+
break;
426+
default: {
427+
std::string errorInfo = Logger::formatString(
428+
"[Snowflake Exception] unknown arrow internal data type(%d) "
429+
"for ARRAY data in %s",
430+
NANOARROW_TYPE_ENUM_STRING[schemaView.type],
431+
schemaView.schema->name);
432+
logger->error(__FILE__, __func__, __LINE__, errorInfo.c_str());
433+
PyErr_SetString(PyExc_Exception, errorInfo.c_str());
434+
break;
435+
}
436+
}
437+
break;
438+
}
439+
440+
case SnowflakeType::Type::MAP: {
441+
converter = std::make_shared<sf::MapConverter>(&schemaView, array,
442+
context, useNumpy);
443+
break;
444+
}
445+
446+
case SnowflakeType::Type::OBJECT: {
447+
switch (schemaView.type) {
448+
case NANOARROW_TYPE_STRING:
449+
converter = std::make_shared<sf::StringConverter>(array);
450+
break;
451+
case NANOARROW_TYPE_STRUCT:
452+
converter = std::make_shared<sf::ObjectConverter>(&schemaView, array,
453+
context, useNumpy);
454+
break;
455+
default: {
456+
std::string errorInfo = Logger::formatString(
457+
"[Snowflake Exception] unknown arrow internal data type(%d) "
458+
"for OBJECT data in %s",
459+
NANOARROW_TYPE_ENUM_STRING[schemaView.type],
460+
schemaView.schema->name);
461+
logger->error(__FILE__, __func__, __LINE__, errorInfo.c_str());
462+
PyErr_SetString(PyExc_Exception, errorInfo.c_str());
463+
break;
464+
}
465+
}
466+
break;
467+
}
468+
411469
case SnowflakeType::Type::VECTOR: {
412470
converter = std::make_shared<sf::FixedSizeListConverter>(array);
413471
break;

src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowTableIterator.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ void CArrowTableIterator::reconstructRecordBatches_nanoarrow() {
112112
case SnowflakeType::Type::BOOLEAN:
113113
case SnowflakeType::Type::CHAR:
114114
case SnowflakeType::Type::DATE:
115+
case SnowflakeType::Type::MAP:
115116
case SnowflakeType::Type::OBJECT:
116117
case SnowflakeType::Type::REAL:
117118
case SnowflakeType::Type::TEXT:
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
//
2+
// Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
3+
//
4+
5+
#include "MapConverter.hpp"
6+
7+
#include <memory>
8+
9+
#include "CArrowChunkIterator.hpp"
10+
#include "CArrowIterator.hpp"
11+
#include "SnowflakeType.hpp"
12+
13+
namespace sf {
14+
Logger* MapConverter::logger = new Logger("snowflake.connector.MapConverter");
15+
16+
void MapConverter::generateError(const std::string& msg) const {
17+
logger->error(__FILE__, __func__, __LINE__, msg.c_str());
18+
PyErr_SetString(PyExc_Exception, msg.c_str());
19+
}
20+
21+
MapConverter::MapConverter(ArrowSchemaView* schemaView, ArrowArrayView* array,
22+
PyObject* context, bool useNumpy) {
23+
m_array = array;
24+
25+
if (schemaView->schema->n_children != 1) {
26+
std::string errorInfo = Logger::formatString(
27+
"[Snowflake Exception] invalid arrow schema for map entries expected 1 "
28+
"schema child, but got %d",
29+
schemaView->schema->n_children);
30+
this->generateError(errorInfo);
31+
return;
32+
}
33+
34+
ArrowSchema* entries = schemaView->schema->children[0];
35+
36+
if (entries->n_children != 2) {
37+
std::string errorInfo = Logger::formatString(
38+
"[Snowflake Exception] invalid arrow schema for map key/value pair "
39+
"expected 2 entries, but got %d",
40+
entries->n_children);
41+
this->generateError(errorInfo);
42+
return;
43+
}
44+
45+
ArrowSchema* key_schema = entries->children[0];
46+
ArrowArrayView* key_array = array->children[0]->children[0];
47+
m_key_converter =
48+
getConverterFromSchema(key_schema, key_array, context, useNumpy, logger);
49+
50+
ArrowSchema* value_schema = entries->children[1];
51+
ArrowArrayView* value_array = array->children[0]->children[1];
52+
m_value_converter = getConverterFromSchema(value_schema, value_array, context,
53+
useNumpy, logger);
54+
}
55+
56+
PyObject* MapConverter::toPyObject(int64_t rowIndex) const {
57+
if (ArrowArrayViewIsNull(m_array, rowIndex)) {
58+
Py_RETURN_NONE;
59+
}
60+
61+
// Map ArrowArrays have two child Arrays that contain the the keys and values.
62+
// The offsets for how many items belong to each row are stored in the parent
63+
// array offset buffer. The start and end of a row slice has to be infered
64+
// from the offsets for each row.
65+
int start = m_array->buffer_views[1].data.as_int32[rowIndex];
66+
int end = m_array->children[0]->length;
67+
if (rowIndex + 1 < m_array->length) {
68+
end = m_array->buffer_views[1].data.as_int32[rowIndex + 1];
69+
}
70+
71+
PyObject* dict = PyDict_New();
72+
for (int i = start; i < end; i++) {
73+
PyDict_SetItem(dict, m_key_converter->toPyObject(i),
74+
m_value_converter->toPyObject(i));
75+
}
76+
return dict;
77+
}
78+
79+
} // namespace sf
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
//
2+
// Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
3+
//
4+
5+
#ifndef PC_MAPCONVERTER_HPP
6+
#define PC_MAPCONVERTER_HPP
7+
8+
#include <memory>
9+
10+
#include "IColumnConverter.hpp"
11+
#include "logging.hpp"
12+
#include "nanoarrow.h"
13+
#include "nanoarrow.hpp"
14+
15+
namespace sf {
16+
17+
class MapConverter : public IColumnConverter {
18+
public:
19+
explicit MapConverter(ArrowSchemaView* schemaView, ArrowArrayView* array,
20+
PyObject* context, bool useNumpy);
21+
22+
PyObject* toPyObject(int64_t rowIndex) const override;
23+
24+
private:
25+
void generateError(const std::string& msg) const;
26+
27+
ArrowArrayView* m_array;
28+
std::shared_ptr<sf::IColumnConverter> m_key_converter;
29+
std::shared_ptr<sf::IColumnConverter> m_value_converter;
30+
static Logger* logger;
31+
};
32+
33+
} // namespace sf
34+
#endif // PC_MAPCONVERTER_HPP
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
//
2+
// Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
3+
//
4+
5+
#include "ObjectConverter.hpp"
6+
7+
#include <memory>
8+
9+
#include "CArrowChunkIterator.hpp"
10+
#include "CArrowIterator.hpp"
11+
#include "SnowflakeType.hpp"
12+
13+
namespace sf {
14+
Logger* ObjectConverter::logger =
15+
new Logger("snowflake.connector.BinaryConverter");
16+
17+
ObjectConverter::ObjectConverter(ArrowSchemaView* schemaView,
18+
ArrowArrayView* array, PyObject* context,
19+
bool useNumpy) {
20+
m_array = array;
21+
m_converters.clear();
22+
m_property_names.clear();
23+
m_propertyCount = schemaView->schema->n_children;
24+
25+
for (int i = 0; i < schemaView->schema->n_children; i++) {
26+
ArrowSchema* property_schema = schemaView->schema->children[i];
27+
28+
m_property_names.push_back(property_schema->name);
29+
30+
ArrowArrayView* child_array = array->children[i];
31+
32+
m_converters.push_back(getConverterFromSchema(property_schema, child_array,
33+
context, useNumpy, logger));
34+
}
35+
}
36+
37+
PyObject* ObjectConverter::toPyObject(int64_t rowIndex) const {
38+
if (ArrowArrayViewIsNull(m_array, rowIndex)) {
39+
Py_RETURN_NONE;
40+
}
41+
42+
PyObject* dict = PyDict_New();
43+
for (int i = 0; i < m_propertyCount; i++) {
44+
PyDict_SetItemString(dict, m_property_names[i],
45+
m_converters[i]->toPyObject(rowIndex));
46+
}
47+
return dict;
48+
}
49+
50+
} // namespace sf

0 commit comments

Comments
 (0)