Commit 022562d

SNOW-747988: add stress tests and minor updates (#1653)
1 parent 6750a33

11 files changed: +262 -88 lines

setup.cfg

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ author_email = [email protected]
 license = Apache-2.0
 license_files = LICENSE.txt, NOTICE
 classifiers =
-    Development Status :: 4 - Beta
+    Development Status :: 3 - Alpha
     Environment :: Console
     Environment :: Other Environment
     Intended Audience :: Developers

src/snowflake/connector/cpp/ArrowIterator/CArrowTableIterator.cpp

Lines changed: 0 additions & 1 deletion
@@ -78,7 +78,6 @@ void CArrowTableIterator::reconstructRecordBatches_nanoarrow()
   }
   if (scale > 0 && columnSchemaView.type != ArrowType::NANOARROW_TYPE_DECIMAL128)
   {
-    // TODO: this log is causing seg fault
     logger->debug(__FILE__, __func__, __LINE__, "Convert fixed number column to double column, column scale %d, column type id: %d",
                   scale, columnSchemaView.type);
     convertScaledFixedNumberColumn_nanoarrow(

src/snowflake/connector/cpp/ArrowIterator/arrow_iterator.pyx

Lines changed: 9 additions & 9 deletions
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2012-2021 Snowflake Computing Inc. All rights reserved.
+# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
 #

 # distutils: language = c++
@@ -213,12 +213,12 @@ cdef class PyArrowIterator(EmptyPyArrowIterator):
         self.unit = 'table'
         self.nanoarrow_Table = self.cIterator.getArrowArrayPtrs()
         self.nanoarrow_Schema = self.cIterator.getArrowSchemaPtrs()
-        cdef vector[PyObject] py_batches
-        batches = []
-        for i in range(self.nanoarrow_Table.size()):
-            array_ptr = self.nanoarrow_Table[i]
-            schema_ptr = self.nanoarrow_Schema[i]
-            batch = pyarrow.RecordBatch._import_from_c(array_ptr, schema_ptr)
-            batches.append(batch)
-        self.pyarrow_table = pyarrow.Table.from_batches(batches=batches)
+        self.pyarrow_table = pyarrow.Table.from_batches(
+            batches=[
+                pyarrow.RecordBatch._import_from_c(
+                    self.nanoarrow_Table[i],
+                    self.nanoarrow_Schema[i]
+                ) for i in range(self.nanoarrow_Table.size())
+            ]
+        )
         snow_logger.debug(msg=f"Batches read: {self.nanoarrow_Table.size()}", path_name=__file__, func_name="init_table_unit")
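The refactor above changes no behavior: each (ArrowArray*, ArrowSchema*) pair produced by the C++ iterator is re-imported through pyarrow's C Data Interface, and the resulting record batches are assembled into one table. For context, here is a minimal plain-Python sketch of that round trip; it assumes a pyarrow version that exposes the private _export_to_c/_import_from_c helpers, and the ctypes buffers stand in for the struct memory that the C++ side owns in the real code.

import ctypes

import pyarrow

# Build a small batch to push through the C Data Interface.
batch = pyarrow.RecordBatch.from_pydict({"c1": [1, 2, 3]})

# Raw memory standing in for the ArrowArray/ArrowSchema structs
# (512 bytes comfortably covers both struct layouts).
c_array = ctypes.create_string_buffer(512)
c_schema = ctypes.create_string_buffer(512)
array_ptr = ctypes.addressof(c_array)
schema_ptr = ctypes.addressof(c_schema)

# Export, then re-import: only struct pointers cross the boundary; the
# underlying Arrow buffers are shared rather than copied.
batch._export_to_c(array_ptr, schema_ptr)
roundtripped = pyarrow.RecordBatch._import_from_c(array_ptr, schema_ptr)
table = pyarrow.Table.from_batches(batches=[roundtripped])
print(table)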

src/snowflake/connector/version.py

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 # Update this for the versions
 # Don't change the forth version number from None
-VERSION = (3, 1, 0, None)
+VERSION = (3, 1, "0a1", None)
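Changing the patch component from the integer 0 to the string "0a1" marks this build as a pre-release alpha, matching the Development Status :: 3 - Alpha classifier change in setup.cfg above. The stress tests below select an iterator implementation based on exactly this; a minimal sketch of the check:

# A pre-release is signalled by a non-numeric patch component, which is
# what test/stress/local_iterator.py below tests with str(VERSION[2]).isdigit().
VERSION = (3, 1, "0a1", None)

is_prerelease = not str(VERSION[2]).isdigit()
version_string = ".".join(str(v) for v in VERSION if v is not None)
print(version_string, "pre-release:", is_prerelease)  # 3.1.0a1 pre-release: True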

stress/__init__.py

Whitespace-only changes.

stress/unit/local_iterator.py

Lines changed: 0 additions & 76 deletions
This file was deleted.

test/stress/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+#
+# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
+#

File renamed without changes.

test/stress/e2e_iterator.py

Lines changed: 125 additions & 0 deletions (new file)

#
# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
#

import argparse

import util as stress_util
from util import task_memory_decorator, task_time_execution_decorator

import snowflake.connector
from parameters import CONNECTION_PARAMETERS

stress_util.print_to_console = False
can_draw = True
try:
    import matplotlib.pyplot as plt
except ImportError:
    can_draw = False


def prepare_data(cursor, row_count=100, test_table_name="TEMP_ARROW_TEST_TABLE"):
    cursor.execute(
        f"""
        CREATE TEMP TABLE {test_table_name} (
        C1 BIGINT, C2 BINARY, C3 BOOLEAN, C4 CHAR, C5 CHARACTER, C6 DATE, C7 DATETIME, C8 DEC(12,3),
        C9 DECIMAL(12,3), C10 DOUBLE, C11 FLOAT, C12 INT, C13 INTEGER, C14 NUMBER, C15 REAL, C16 BYTEINT,
        C17 SMALLINT, C18 STRING, C19 TEXT, C20 TIME, C21 TIMESTAMP, C22 TIMESTAMP_TZ, C23 TIMESTAMP_LTZ,
        C24 TIMESTAMP_NTZ, C25 TINYINT, C26 VARBINARY, C27 VARCHAR);
        """
    )

    for _ in range(row_count):
        cursor.execute(
            f"""
            INSERT INTO {test_table_name} SELECT
            123456,
            TO_BINARY('HELP', 'UTF-8'),
            TRUE,
            'a',
            'b',
            '2023-07-18',
            '2023-07-18 12:51:00',
            984.28,
            268.35,
            123.456,
            738.132,
            6789,
            23456,
            12583,
            513.431,
            10,
            9,
            'abc456',
            'def123',
            '12:34:56',
            '2021-01-01 00:00:00 +0000',
            '2021-01-01 00:00:00 +0000',
            '2021-01-01 00:00:00 +0000',
            '2021-01-01 00:00:00 +0000',
            1,
            TO_BINARY('HELP', 'UTF-8'),
            'vxlmls!21321#@!#!'
            ;
            """
        )


def task_fetch_rows(cursor, table_name):
    ret = cursor.execute(f"select * from {table_name}").fetchall()
    for _ in ret:
        pass


def task_fetch_arrow_batches(cursor, table_name):
    ret = cursor.execute(f"select * from {table_name}").fetch_arrow_batches()
    for _ in ret:
        pass


def execute_task(task, cursor, table_name, iteration_cnt):
    for _ in range(iteration_cnt):
        task(cursor, table_name)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--iteration_cnt", type=int, default=5000)
    parser.add_argument("--data_file", type=str, default="test_data")
    parser.add_argument("--row_count", type=int, default=100)
    parser.add_argument("--test_table_name", type=str, default="ARROW_TEST_TABLE")
    args = parser.parse_args()

    test_table_name = "TEMP_ARROW_TEST_TABLE"

    with snowflake.connector.connect(
        **CONNECTION_PARAMETERS
    ) as conn, conn.cursor() as cursor:
        if not args.test_table_name:
            print("preparing data started")
            prepare_data(cursor, args.row_count)
            print("preparing data is done")
        else:
            print("using data in existing table")
            test_table_name = args.test_table_name

        memory_check_task = task_memory_decorator(task_fetch_arrow_batches)
        execute_task(memory_check_task, cursor, test_table_name, args.iteration_cnt)
        memory_records = stress_util.collect_memory_records()

        perf_check_task = task_time_execution_decorator(task_fetch_arrow_batches)
        execute_task(perf_check_task, cursor, test_table_name, args.iteration_cnt)
        time_records = stress_util.collect_time_execution_records()

        print("average time is", sum(time_records) / len(time_records))

        if can_draw:
            plt.plot([i for i in range(len(time_records))], time_records)
            plt.title("per iteration execution time")
            plt.show()
            plt.plot(
                [item[0] for item in memory_records],
                [item[1] for item in memory_records],
            )
            plt.title("memory usage")
            plt.show()
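Both stress scripts import task_memory_decorator and task_time_execution_decorator from a local util module that this diff does not display (possibly the file renamed without changes above). As a rough sketch only, helpers with this interface could be built on psutil RSS sampling and wall-clock timing; every body below is an assumption, not the actual util.py.

# Hypothetical reconstruction of the util helpers the stress scripts import;
# the real util.py is not shown in this commit view.
import time

import psutil  # assumed dependency for memory sampling

print_to_console = True
_memory_records = []  # (iteration_index, rss_bytes) samples
_time_records = []  # per-call wall-clock durations in seconds


def task_memory_decorator(task):
    def wrapper(*args, **kwargs):
        task(*args, **kwargs)
        # Sample the resident set size of this process after each task run.
        rss = psutil.Process().memory_info().rss
        _memory_records.append((len(_memory_records), rss))
        if print_to_console:
            print("rss:", rss)

    return wrapper


def task_time_execution_decorator(task):
    def wrapper(*args, **kwargs):
        start = time.time()
        task(*args, **kwargs)
        _time_records.append(time.time() - start)

    return wrapper


def collect_memory_records():
    return _memory_records


def collect_time_execution_records():
    return _time_records

Note that the scripts set stress_util.print_to_console = False at import time; because the wrapper reads the module-level flag at call time, that reassignment takes effect in this sketch as well.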

test/stress/local_iterator.py

Lines changed: 119 additions & 0 deletions (new file)

#
# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
#

import argparse
import base64
import io

import util as stress_util
from util import task_memory_decorator, task_time_execution_decorator

from snowflake.connector.arrow_context import ArrowConverterContext
from snowflake.connector.arrow_iterator import PyArrowIterator
from snowflake.connector.version import VERSION

stress_util.print_to_console = False
can_draw = True
try:
    import matplotlib.pyplot as plt
except ImportError:
    can_draw = False


def create_pyarrow_iterator(input_data):
    # create nanoarrow based iterator
    return PyArrowIterator(
        None,
        input_data,
        ArrowConverterContext(session_parameters={"TIMEZONE": "America/Los_Angeles"}),
        False,
        False,
        False,
    )


def create_old_pyarrow_iterator(input_data):
    # create vendored arrow based iterator
    return PyArrowIterator(
        None,
        io.BytesIO(input_data),
        ArrowConverterContext(session_parameters={"TIMEZONE": "America/Los_Angeles"}),
        False,
        False,
        False,
    )


def task_for_loop_iterator(input_data, create_iterator_method):
    for _ in create_iterator_method(input_data):
        pass


def task_for_loop_table_iterator(input_data, create_iterator_method):
    iterator = create_iterator_method(input_data)
    iterator.init_table_unit()
    for _ in iterator:
        pass


def execute_task(task, bytes_data, create_iterator_method, iteration_cnt):
    for _ in range(iteration_cnt):
        task(bytes_data, create_iterator_method)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--iteration_cnt", type=int, default=100000)
    parser.add_argument("--data_file", type=str, default="test_data")
    args = parser.parse_args()

    with open(args.data_file) as f:
        b64data = f.read()

    decode_bytes = base64.b64decode(b64data)

    # if the connector is a pre-release, it uses the nanoarrow based iterator
    print(
        "Testing connector version: ",
        ".".join([str(v) for v in VERSION if v is not None]),
    )
    create_arrow_iterator_method = (
        create_old_pyarrow_iterator
        if str(VERSION[2]).isdigit()
        else create_pyarrow_iterator
    )

    perf_check_task_for_loop_iterator = task_time_execution_decorator(
        task_for_loop_table_iterator
    )
    memory_check_task_for_loop_iterator = task_memory_decorator(
        task_for_loop_table_iterator
    )

    execute_task(
        memory_check_task_for_loop_iterator,
        decode_bytes,
        create_arrow_iterator_method,
        args.iteration_cnt,
    )
    memory_records = stress_util.collect_memory_records()
    execute_task(
        perf_check_task_for_loop_iterator,
        decode_bytes,
        create_arrow_iterator_method,
        args.iteration_cnt,
    )
    time_records = stress_util.collect_time_execution_records()

    print("average time is", sum(time_records) / len(time_records))

    if can_draw:
        plt.plot([i for i in range(len(time_records))], time_records)
        plt.title("per iteration execution time")
        plt.show()
        plt.plot(
            [item[0] for item in memory_records], [item[1] for item in memory_records]
        )
        plt.title("memory usage")
        plt.show()
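local_iterator.py expects a test_data file holding a base64-encoded blob that PyArrowIterator can consume. The payload format is not specified in this commit; assuming it is a base64-encoded Arrow IPC stream (the format the connector receives in Snowflake result chunks), a hypothetical generator could look like the following, where the column names and types are illustrative only, not the connector's actual result schema.

# Sketch: write a base64-encoded Arrow IPC stream to ./test_data,
# under the (unverified) assumption that this is the expected input format.
import base64
import io

import pyarrow
import pyarrow.ipc

batch = pyarrow.RecordBatch.from_pydict(
    {
        "C1": pyarrow.array([123456] * 100, type=pyarrow.int64()),
        "C18": pyarrow.array(["abc456"] * 100, type=pyarrow.string()),
    }
)

buf = io.BytesIO()
with pyarrow.ipc.new_stream(buf, batch.schema) as writer:
    writer.write_batch(batch)

with open("test_data", "w") as f:
    f.write(base64.b64encode(buf.getvalue()).decode())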
