Skip to content

Commit de93ef4

Browse files
authored
SNOW-920892: v3.3.0 nanoarrow based connector (#1750)
1 parent 246eb8f commit de93ef4

File tree

105 files changed

+44863
-47
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

105 files changed

+44863
-47
lines changed

.github/workflows/build_test.yml

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,68 @@ jobs:
296296
.coverage.py${{ env.shortver }}-lambda-ci
297297
junit.py${{ env.shortver }}-lambda-ci-dev.xml
298298
299+
test-vendoredarrow:
300+
name: Test Vendored Arrow ${{ matrix.os.download_name }}-${{ matrix.python-version }}-${{ matrix.cloud-provider }}
301+
needs: build
302+
runs-on: ${{ matrix.os.image_name }}
303+
strategy:
304+
fail-fast: false
305+
matrix:
306+
os:
307+
- image_name: ubuntu-latest
308+
download_name: manylinux_x86_64
309+
- image_name: macos-latest
310+
download_name: macosx_x86_64
311+
- image_name: windows-2019
312+
download_name: win_amd64
313+
python-version: ["3.8", "3.11"]
314+
cloud-provider: [aws]
315+
steps:
316+
- uses: actions/checkout@v3
317+
- name: Set up Python
318+
uses: actions/setup-python@v4
319+
with:
320+
python-version: ${{ matrix.python-version }}
321+
- name: Display Python version
322+
run: python -c "import sys; print(sys.version)"
323+
- name: Setup parameters file
324+
shell: bash
325+
env:
326+
PARAMETERS_SECRET: ${{ secrets.PARAMETERS_SECRET }}
327+
run: |
328+
gpg --quiet --batch --yes --decrypt --passphrase="$PARAMETERS_SECRET" \
329+
.github/workflows/parameters/public/parameters_${{ matrix.cloud-provider }}.py.gpg > test/parameters.py
330+
- name: Download wheel(s)
331+
uses: actions/download-artifact@v3
332+
with:
333+
name: ${{ matrix.os.download_name }}_py${{ matrix.python-version }}
334+
path: dist
335+
- name: Show wheels downloaded
336+
run: ls -lh dist
337+
shell: bash
338+
- name: Upgrade setuptools, pip and wheel
339+
run: python -m pip install -U setuptools pip wheel
340+
- name: Install tox
341+
run: python -m pip install tox tox-external-wheels
342+
- name: Run tests
343+
run: python -m tox -e "py${PYTHON_VERSION/\./}-{extras,unit,integ,pandas,sso}-ci"
344+
env:
345+
PYTHON_VERSION: ${{ matrix.python-version }}
346+
cloud_provider: ${{ matrix.cloud-provider }}
347+
PYTEST_ADDOPTS: --color=yes --tb=short
348+
TOX_PARALLEL_NO_SPINNER: 1
349+
TEST_USING_VENDORED_ARROW: true
350+
shell: bash
351+
- name: Combine coverages
352+
run: python -m tox -e coverage --skip-missing-interpreters false
353+
shell: bash
354+
- uses: actions/upload-artifact@v3
355+
with:
356+
name: coverage_vendored_arrow_${{ matrix.os.download_name }}-${{ matrix.python-version }}-${{ matrix.cloud-provider }}
357+
path: |
358+
.tox/.coverage
359+
.tox/coverage.xml
360+
299361
combine-coverage:
300362
if: ${{ success() || failure() }}
301363
name: Combine coverage

.pre-commit-config.yaml

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,17 @@ repos:
88
rev: v4.4.0
99
hooks:
1010
- id: trailing-whitespace
11+
exclude: >
12+
(?x)^(
13+
src/snowflake/connector/nanoarrow_cpp/ArrowIterator/flatcc/.*\.h|
14+
src/snowflake/connector/nanoarrow_cpp/ArrowIterator/nanoarrow_ipc.c|
15+
)$
1116
- id: end-of-file-fixer
12-
exclude: license_header.txt
17+
exclude: >
18+
(?x)^(
19+
license_header.txt|
20+
src/snowflake/connector/nanoarrow_cpp/ArrowIterator/flatcc/.*\.h|
21+
)$
1322
- id: check-yaml
1423
exclude: .github/repo_meta.yaml
1524
- id: debug-statements
@@ -28,18 +37,27 @@ repos:
2837
(?x)^(
2938
src/snowflake/connector/version.py|
3039
src/snowflake/connector/cpp|
40+
src/snowflake/connector/nanoarrow_cpp|
3141
)$
3242
args:
3343
- --license-filepath
3444
- license_header.txt
3545
- id: insert-license
3646
name: insert-cpp-license
37-
files: src/snowflake/connector/cpp/.*\.(cpp|hpp)$
47+
files: >
48+
(?x)^(
49+
src/snowflake/connector/cpp/.*\.(cpp|hpp)|
50+
src/snowflake/connector/nanoarrow_cpp/.*\.(cpp|hpp)|
51+
)$
3852
args:
3953
- --comment-style
4054
- //
4155
- --license-filepath
4256
- license_header.txt
57+
exclude: >
58+
(?x)^(
59+
src/snowflake/connector/nanoarrow_cpp/ArrowIterator/nanoarrow.hpp|
60+
)$
4361
- repo: https://github.com/asottile/yesqa
4462
rev: v1.5.0
4563
hooks:

DESCRIPTION.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ Source code is also available at: https://github.com/snowflakedb/snowflake-conne
88

99
# Release Notes
1010

11+
- v3.3.0(Unreleased)
12+
- Updated to use apache arrow-nanoarrow project for result arrow data conversion.
13+
1114
- v3.2.1(September 26,2023)
1215

1316
- Fixed a bug where url port and path were ignored in private link oscp retry.

MANIFEST.in

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,20 @@ include *.rst
33
include LICENSE.txt
44
include NOTICE
55
include pyproject.toml
6+
include src/snowflake/connector/nanoarrow_cpp/ArrowIterator/LICENSE.txt
67
recursive-include src/snowflake/connector py.typed *.py *.pyx
78
recursive-include src/snowflake/connector/vendored LICENSE*
89

910
recursive-include src/snowflake/connector/cpp *.cpp *.hpp
11+
recursive-include src/snowflake/connector/cpp *.c *.h
1012
exclude src/snowflake/connector/arrow_iterator.cpp
13+
exclude src/snowflake/connector/cpp/ArrowIterator/arrow_iterator.cpp
14+
15+
recursive-include src/snowflake/connector/nanoarrow_cpp *.cpp *.hpp
16+
recursive-include src/snowflake/connector/nanoarrow_cpp *.c *.h
17+
exclude src/snowflake/connector/nanoarrow_cpp/ArrowIterator/nanoarrow_arrow_iterator.cpp
18+
exclude src/snowflake/connector/nanoarrow_cpp/scripts/.clang-format
19+
exclude src/snowflake/connector/nanoarrow_cpp/scripts/format.sh
1120

1221
exclude .git-blame-ignore-revs
1322
exclude .pre-commit-config.yaml

setup.py

Lines changed: 119 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from setuptools import Extension, setup
1212

1313
CONNECTOR_SRC_DIR = os.path.join("src", "snowflake", "connector")
14+
NANOARROW_SRC_DIR = os.path.join(CONNECTOR_SRC_DIR, "nanoarrow_cpp", "ArrowIterator")
1415

1516
VERSION = (1, 1, 1, None) # Default
1617
try:
@@ -59,10 +60,19 @@
5960
pyarrow_version = tuple(int(x) for x in pyarrow.__version__.split("."))
6061
extensions = cythonize(
6162
[
63+
# vendored arrow iterator
6264
Extension(
6365
name="snowflake.connector.arrow_iterator",
6466
sources=[os.path.join(CONNECTOR_SRC_DIR, "arrow_iterator.pyx")],
6567
),
68+
# nanoarrow iterator
69+
Extension(
70+
name="snowflake.connector.nanoarrow_arrow_iterator",
71+
sources=[
72+
os.path.join(NANOARROW_SRC_DIR, "nanoarrow_arrow_iterator.pyx")
73+
],
74+
language="c++",
75+
),
6676
],
6777
compile_time_env=dict(ARROW_LESS_THAN_8=pyarrow_version < (8,)),
6878
)
@@ -119,6 +129,7 @@ def build_extension(self, ext):
119129
ext.extra_link_args.append("-g")
120130
current_dir = os.getcwd()
121131

132+
# vendored arrow extension
122133
if ext.name == "snowflake.connector.arrow_iterator":
123134
if not os.environ.get("SF_NO_COPY_ARROW_LIB", False):
124135
self._copy_arrow_lib()
@@ -159,7 +170,10 @@ def build_extension(self, ext):
159170
if "std=" not in os.environ.get("CXXFLAGS", ""):
160171
ext.extra_compile_args.append("-std=c++17")
161172
ext.extra_compile_args.append("-D_GLIBCXX_USE_CXX11_ABI=0")
162-
if sys.platform == "darwin":
173+
if (
174+
sys.platform == "darwin"
175+
and "macosx-version-min" not in os.environ.get("CXXFLAGS", "")
176+
):
163177
ext.extra_compile_args.append("-mmacosx-version-min=10.13")
164178

165179
ext.library_dirs.append(
@@ -177,7 +191,110 @@ def build_extension(self, ext):
177191
# fyi, https://medium.com/@donblas/fun-with-rpath-otool-and-install-name-tool-e3e41ae86172
178192
ext.extra_link_args += ["-rpath", "@loader_path"]
179193

180-
build_ext.build_extension(self, ext)
194+
# nanoarrow extension
195+
if ext.name == "snowflake.connector.nanoarrow_arrow_iterator":
196+
NANOARROW_CPP_SRC_DIR = os.path.join(CONNECTOR_SRC_DIR, "nanoarrow_cpp")
197+
NANOARROW_ARROW_ITERATOR_SRC_DIR = os.path.join(
198+
NANOARROW_CPP_SRC_DIR, "ArrowIterator"
199+
)
200+
NANOARROW_LOGGING_SRC_DIR = os.path.join(
201+
NANOARROW_CPP_SRC_DIR, "Logging"
202+
)
203+
204+
ext.sources += [
205+
os.path.join(
206+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "CArrowIterator.cpp"
207+
),
208+
os.path.join(
209+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "CArrowChunkIterator.cpp"
210+
),
211+
os.path.join(
212+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "CArrowTableIterator.cpp"
213+
),
214+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "SnowflakeType.cpp"),
215+
os.path.join(
216+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "BinaryConverter.cpp"
217+
),
218+
os.path.join(
219+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "BooleanConverter.cpp"
220+
),
221+
os.path.join(
222+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "DecimalConverter.cpp"
223+
),
224+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "DateConverter.cpp"),
225+
os.path.join(
226+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "FloatConverter.cpp"
227+
),
228+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "IntConverter.cpp"),
229+
os.path.join(
230+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "StringConverter.cpp"
231+
),
232+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "TimeConverter.cpp"),
233+
os.path.join(
234+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "TimeStampConverter.cpp"
235+
),
236+
os.path.join(
237+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "Python", "Common.cpp"
238+
),
239+
os.path.join(
240+
NANOARROW_ARROW_ITERATOR_SRC_DIR, "Python", "Helpers.cpp"
241+
),
242+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "Util", "time.cpp"),
243+
NANOARROW_LOGGING_SRC_DIR + "/logging.cpp",
244+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "nanoarrow.c"),
245+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "nanoarrow_ipc.c"),
246+
os.path.join(NANOARROW_ARROW_ITERATOR_SRC_DIR, "flatcc.c"),
247+
]
248+
ext.include_dirs.append(NANOARROW_ARROW_ITERATOR_SRC_DIR)
249+
ext.include_dirs.append(NANOARROW_LOGGING_SRC_DIR)
250+
251+
if sys.platform == "win32":
252+
if not any("/std" not in s for s in ext.extra_compile_args):
253+
ext.extra_compile_args.append("/std:c++17")
254+
elif sys.platform == "linux" or sys.platform == "darwin":
255+
if "std=" not in os.environ.get("CXXFLAGS", ""):
256+
ext.extra_compile_args.append("-std=c++17")
257+
ext.extra_compile_args.append("-D_GLIBCXX_USE_CXX11_ABI=0")
258+
if (
259+
sys.platform == "darwin"
260+
and "macosx-version-min" not in os.environ.get("CXXFLAGS", "")
261+
):
262+
ext.extra_compile_args.append("-mmacosx-version-min=10.13")
263+
264+
ext.library_dirs.append(
265+
os.path.join(current_dir, self.build_lib, "snowflake", "connector")
266+
)
267+
268+
# sys.platform for linux used to return with version suffix, (i.e. linux2, linux3)
269+
# After version 3.3, it will always be just 'linux'
270+
# https://docs.python.org/3/library/sys.html#sys.platform
271+
if sys.platform == "linux":
272+
ext.extra_link_args += ["-Wl,-rpath,$ORIGIN"]
273+
elif sys.platform == "darwin":
274+
# rpath,$ORIGIN only work on linux, did not work on darwin. use @loader_path instead
275+
# fyi, https://medium.com/@donblas/fun-with-rpath-otool-and-install-name-tool-e3e41ae86172
276+
ext.extra_link_args += ["-rpath", "@loader_path"]
277+
278+
original__compile = self.compiler._compile
279+
280+
# the following is required by nanoarrow to compile c files
281+
def new__compile(obj, src: str, ext, cc_args, extra_postargs, pp_opts):
282+
if (
283+
src.endswith("nanoarrow.c")
284+
or src.endswith("nanoarrow_ipc.c")
285+
or src.endswith("flatcc.c")
286+
):
287+
extra_postargs = [s for s in extra_postargs if s != "-std=c++17"]
288+
return original__compile(
289+
obj, src, ext, cc_args, extra_postargs, pp_opts
290+
)
291+
292+
self.compiler._compile = new__compile
293+
294+
try:
295+
build_ext.build_extension(self, ext)
296+
finally:
297+
self.compiler._compile = original__compile
181298

182299
def _get_arrow_lib_dir(self):
183300
if "SF_ARROW_LIBDIR" in os.environ:

src/snowflake/connector/arrow_context.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import time
1010
from datetime import datetime, timedelta, tzinfo
1111
from logging import getLogger
12+
from sys import byteorder
1213
from typing import TYPE_CHECKING
1314

1415
import pytz
@@ -148,3 +149,11 @@ def TIMESTAMP_NTZ_TWO_FIELD_to_numpy_datetime64(
148149
) -> datetime64:
149150
nanoseconds = int(decimal.Decimal(epoch).scaleb(9) + decimal.Decimal(fraction))
150151
return numpy.datetime64(nanoseconds, "ns")
152+
153+
def DECIMAL128_to_decimal(self, int128_bytes: bytes, scale: int) -> decimal.Decimal:
154+
int128 = int.from_bytes(int128_bytes, byteorder=byteorder, signed=True)
155+
if scale == 0:
156+
return int128
157+
digits = [int(digit) for digit in str(int128) if digit != "-"]
158+
sign = int128 < 0
159+
return decimal.Decimal((sign, digits, -scale))

src/snowflake/connector/connection.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727

2828
from cryptography.hazmat.primitives.asymmetric.rsa import RSAPrivateKey
2929

30+
import snowflake.connector.cursor
31+
import snowflake.connector.result_batch
32+
3033
from . import errors, proxy
3134
from ._query_context_cache import QueryContextCache
3235
from .auth import (
@@ -57,6 +60,7 @@
5760
PARAMETER_CLIENT_TELEMETRY_OOB_ENABLED,
5861
PARAMETER_CLIENT_VALIDATE_DEFAULT_PARAMETERS,
5962
PARAMETER_ENABLE_STAGE_S3_PRIVATELINK_FOR_US_EAST_1,
63+
PARAMETER_PYTHON_CONNECTOR_USE_NANOARROW,
6064
PARAMETER_QUERY_CONTEXT_CACHE_SIZE,
6165
PARAMETER_SERVICE_NAME,
6266
PARAMETER_TIMEZONE,
@@ -369,6 +373,11 @@ def __init__(
369373
# connection_name is None and kwargs was empty when called
370374
kwargs = _get_default_connection_params()
371375
self.__set_error_attributes()
376+
# by default, nanoarrow converter is used, the following two will be reset in self.__open_connection
377+
self._server_use_nanoarrow_converter_parameter = True
378+
self._create_pyarrow_iterator_method = (
379+
snowflake.connector.result_batch._create_nanoarrow_iterator
380+
)
372381
self.connect(**kwargs)
373382
self._telemetry = TelemetryClient(self._rest)
374383

@@ -926,6 +935,22 @@ def __open_connection(self):
926935
# By this point it should have been decided if the heartbeat has to be enabled
927936
# and what would the heartbeat frequency be
928937
self._add_heartbeat()
938+
if (
939+
(
940+
snowflake.connector.cursor.NANOARROW_USAGE
941+
== snowflake.connector.cursor.NanoarrowUsage.FOLLOW_SESSION_PARAMETER
942+
and self._server_use_nanoarrow_converter_parameter
943+
)
944+
or snowflake.connector.cursor.NANOARROW_USAGE
945+
== snowflake.connector.cursor.NanoarrowUsage.ENABLE_NANOARROW
946+
):
947+
self._create_pyarrow_iterator_method = (
948+
snowflake.connector.result_batch._create_nanoarrow_iterator
949+
)
950+
else:
951+
self._create_pyarrow_iterator_method = (
952+
snowflake.connector.result_batch._create_vendored_arrow_iterator
953+
)
929954

930955
def __config(self, **kwargs):
931956
"""Sets up parameters in the connection object."""
@@ -1548,6 +1573,8 @@ def _update_parameters(
15481573
self.enable_stage_s3_privatelink_for_us_east_1 = value
15491574
elif PARAMETER_QUERY_CONTEXT_CACHE_SIZE == name:
15501575
self.query_context_cache_size = value
1576+
elif PARAMETER_PYTHON_CONNECTOR_USE_NANOARROW == name:
1577+
self._server_use_nanoarrow_converter_parameter = value
15511578

15521579
def _format_query_for_log(self, query: str) -> str:
15531580
ret = " ".join(line.strip() for line in query.split("\n"))

src/snowflake/connector/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,7 @@ class FileHeader(NamedTuple):
246246
"ENABLE_STAGE_S3_PRIVATELINK_FOR_US_EAST_1"
247247
)
248248
PARAMETER_MULTI_STATEMENT_COUNT = "MULTI_STATEMENT_COUNT"
249+
PARAMETER_PYTHON_CONNECTOR_USE_NANOARROW = "PYTHON_CONNECTOR_USE_NANOARROW"
249250

250251
HTTP_HEADER_CONTENT_TYPE = "Content-Type"
251252
HTTP_HEADER_CONTENT_ENCODING = "Content-Encoding"

0 commit comments

Comments
 (0)