Skip to content

Commit 8aa988e

Browse files
committed
impl
1 parent fd46eb2 commit 8aa988e

File tree

7 files changed

+215
-25
lines changed

7 files changed

+215
-25
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44

55
### Snowpark Python API Updates
66

7+
#### New Features
8+
9+
- Added PostgreSQL support to `DataFrameReader.dbapi` (PrPr) for both Parquet and UDTF-based ingestion.
10+
711
#### Improvements
812

913
- Invoking snowflake system procedures does not invoke an additional `describe procedure` call to check the return type of the procedure.

src/snowflake/snowpark/_internal/data_source/drivers/base_driver.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ def to_snow_type(self, schema: List[Any]) -> StructType:
4040
f"{self.__class__.__name__} has not implemented to_snow_type function"
4141
)
4242

43+
@staticmethod
4344
def prepare_connection(
44-
self,
4545
conn: "Connection",
4646
query_timeout: int = 0,
4747
) -> "Connection":
@@ -81,7 +81,7 @@ def udtf_ingestion(
8181
udtf_name = f"data_source_udtf_{generate_random_alphanumeric(5)}"
8282
start = time.time()
8383
session.udtf.register(
84-
self.udtf_class_builder(fetch_size=fetch_size),
84+
self.udtf_class_builder(fetch_size=fetch_size, schema=schema),
8585
name=udtf_name,
8686
output_schema=StructType(
8787
[
@@ -104,7 +104,9 @@ def udtf_ingestion(
104104
]
105105
return res.select(cols)
106106

107-
def udtf_class_builder(self, fetch_size: int = 1000) -> type:
107+
def udtf_class_builder(
108+
self, fetch_size: int = 1000, schema: StructType = None
109+
) -> type:
108110
create_connection = self.create_connection
109111

110112
class UDTFIngestion:

src/snowflake/snowpark/_internal/data_source/drivers/oracledb_driver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,8 @@ def to_snow_type(self, schema: List[Any]) -> StructType:
109109

110110
return StructType(fields)
111111

112+
@staticmethod
112113
def prepare_connection(
113-
self,
114114
conn: "Connection",
115115
query_timeout: int = 0,
116116
) -> "Connection":

src/snowflake/snowpark/_internal/data_source/drivers/psycopg2_driver.py

Lines changed: 96 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -173,16 +173,10 @@ def __init__(
173173
super().__init__(create_connection, dbms_type)
174174

175175
def to_snow_type(self, schema: List[Any]) -> StructType:
176-
# TODO: Implement this method to convert PostgreSQL types to Snowflake types.
177-
# https://other-docs.snowflake.com/en/connectors/postgres6/view-data#postgresql-to-snowflake-data-type-mapping
178-
# psycopg2 type code: https://github.com/psycopg/psycopg2/blob/master/psycopg/pgtypes.h
179-
# https://www.postgresql.org/docs/current/datatype.html
176+
# The psycopg2 spec is defined in the following links:
180177
# https://www.psycopg.org/docs/cursor.html#cursor.description
181-
# https://www.psycopg.org/docs/extensions.html#psycopg2.extensions.Column.type_code
182-
# https://www.postgresql.org/docs/current/catalog-pg-type.html
183-
# https://www.psycopg.org/docs/advanced.html#type-casting-from-sql-to-python
184-
fields = []
185178
# https://www.psycopg.org/docs/extensions.html#psycopg2.extensions.Column
179+
fields = []
186180
for (
187181
name,
188182
type_code,
@@ -222,14 +216,6 @@ def data_source_data_to_pandas_df(
222216
data: List[Any], schema: StructType
223217
) -> "pd.DataFrame":
224218
df = BaseDriver.data_source_data_to_pandas_df(data, schema)
225-
# psycopg2 returns binary data as memoryview, we need to convert it to bytes
226-
binary_type_indexes = [
227-
i
228-
for i, field in enumerate(schema.fields)
229-
if isinstance(field.datatype, BinaryType)
230-
]
231-
col_names = df.columns[binary_type_indexes]
232-
df[col_names] = BaseDriver.df_map_method(df[col_names])(lambda x: bytes(x))
233219

234220
variant_type_indexes = [
235221
i
@@ -259,8 +245,8 @@ def to_result_snowpark_df(
259245
project_columns, _emit_ast=_emit_ast
260246
)
261247

248+
@staticmethod
262249
def prepare_connection(
263-
self,
264250
conn: "Connection",
265251
query_timeout: int = 0,
266252
) -> "Connection":
@@ -275,4 +261,97 @@ def prepare_connection(
275261
lambda data, cursor: data,
276262
)
277263
register_type(SNOWPARK_INTERVAL_STR, conn)
264+
265+
# by default psycopg2 returns binary data as memoryview
266+
# to avoid using pandas to convert memoryview to bytes, we use the following native psycopg2 type conversion
267+
# psycopg2.extensions.new_type() only works for text format data, it returns bytes as hex string
268+
# we reconstruct the bytes from hex string
269+
SNOWPARK_BYTE = new_type(
270+
(Psycopg2TypeCode.BYTEAOID.value,),
271+
"SNOWPARK_BYTE_BYTES",
272+
lambda data, cursor: bytes.fromhex(data[2:])
273+
if data is not None
274+
else None, # [2:] to skip the '\\x' prefix
275+
)
276+
register_type(SNOWPARK_BYTE, conn)
277+
278+
if query_timeout:
279+
# https://www.postgresql.org/docs/current/runtime-config-client.html#GUC-STATEMENT-TIMEOUT
280+
# postgres default uses milliseconds
281+
conn.cursor().execute(f"SET STATEMENT_TIMEOUT = {query_timeout * 1000}")
278282
return conn
283+
284+
def udtf_class_builder(
285+
self, fetch_size: int = 1000, schema: StructType = None
286+
) -> type:
287+
create_connection = self.create_connection
288+
289+
# TODO: SNOW-2101485 use class method to prepare connection
290+
# ideally we should use the same function as prepare_connection
291+
# however, since we introduce new module for new driver support and initially the new module is not available in the backend
292+
# so if registering UDTF which uses the class method, cloudpickle will pickle the class method along with
293+
# the new module -- this leads to not being able to find the new module when unpickling on the backend.
294+
# once the new module is available in the backend, we can use the class method.
295+
def prepare_connection_in_udtf(
296+
conn: "Connection",
297+
query_timeout: int = 0,
298+
) -> "Connection":
299+
# The following is to align with Snowflake Connector behavior which get Interval as string
300+
# the default behavior of psycopg2 is to get Interval as datetime.timedelta
301+
# https://other-docs.snowflake.com/en/connectors/postgres6/view-data#postgresql-to-snowflake-data-type-mapping
302+
from psycopg2.extensions import new_type, register_type
303+
304+
# we do not use Psycopg2TypeCode.INTERVALOID.value because the UDTF pickles the psycopg2_driver module
305+
# unpickling in the UDTF would result in a module-not-found error if the package is not available in the backend
306+
SNOWPARK_INTERVAL_STR = new_type(
307+
(1186,),
308+
"SNOWPARK_INTERVAL_STR",
309+
lambda data, cursor: data,
310+
)
311+
register_type(SNOWPARK_INTERVAL_STR, conn)
312+
313+
if query_timeout:
314+
# https://www.postgresql.org/docs/current/runtime-config-client.html#GUC-STATEMENT-TIMEOUT
315+
# postgres default uses milliseconds
316+
conn.cursor().execute(f"SET STATEMENT_TIMEOUT = {query_timeout * 1000}")
317+
return conn
318+
319+
binary_column_indexes = [
320+
i
321+
for i, field in enumerate(schema.fields)
322+
if isinstance(field.datatype, BinaryType)
323+
]
324+
time_column_indexes = [
325+
i
326+
for i, field in enumerate(schema.fields)
327+
if isinstance(field.datatype, TimeType)
328+
]
329+
330+
# postgres returns binary data as memoryview, we need to convert it to bytes
331+
def convert_rows(rows_to_update):
332+
ret = []
333+
for row in rows_to_update:
334+
# convert tuple to list to make it mutable
335+
new_row = list(row)
336+
# convert bytes to hexstring so that variant column can be cast to bytes
337+
for idx in binary_column_indexes:
338+
new_row[idx] = bytes(row[idx]).hex() if row[idx] else None
339+
# remove timezone info from time columns
340+
for idx in time_column_indexes:
341+
new_row[idx] = row[idx].replace(tzinfo=None) if row[idx] else None
342+
# convert list back to tuple as UDTF requires tuple
343+
ret.append(tuple(new_row))
344+
return ret
345+
346+
class UDTFIngestion:
347+
def process(self, query: str):
348+
conn = prepare_connection_in_udtf(create_connection())
349+
cursor = conn.cursor()
350+
cursor.execute(query)
351+
while True:
352+
rows = cursor.fetchmany(fetch_size)
353+
if not rows:
354+
break
355+
yield from convert_rows(rows)
356+
357+
return UDTFIngestion

src/snowflake/snowpark/_internal/data_source/drivers/pyodbc_driver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,8 +116,8 @@ def process(self, query: str):
116116

117117
return UDTFIngestion
118118

119+
@staticmethod
119120
def prepare_connection(
120-
self,
121121
conn: "Connection",
122122
query_timeout: int = 0,
123123
) -> "Connection":

tests/integ/datasource/test_postgres.py

Lines changed: 107 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import datetime
66
from decimal import Decimal
77

8+
from snowflake.snowpark import Row
89
from snowflake.snowpark.exceptions import SnowparkDataframeReaderException
910
from snowflake.snowpark.types import (
1011
StructType,
@@ -40,7 +41,7 @@
4041
]
4142

4243

43-
TEST_TABLE_NAME = "test_schema.ALL_TYPE_TABLE"
44+
POSTGRES_TABLE_NAME = "test_schema.ALL_TYPE_TABLE"
4445
EXPECTED_TEST_DATA = [
4546
(
4647
-6645531000000000000,
@@ -267,6 +268,51 @@
267268
"960b86a9-a8dd-4634-bc1f-956ae6589726",
268269
"<root><element>47</element></root>",
269270
),
271+
(
272+
None,
273+
6,
274+
None,
275+
None,
276+
None,
277+
None,
278+
None,
279+
None,
280+
None,
281+
None,
282+
None,
283+
None,
284+
None,
285+
None,
286+
None,
287+
None,
288+
"null",
289+
"null",
290+
None,
291+
None,
292+
None,
293+
None,
294+
"null",
295+
None,
296+
None,
297+
None,
298+
None,
299+
None,
300+
None,
301+
None,
302+
None,
303+
6,
304+
6,
305+
None,
306+
None,
307+
None,
308+
None,
309+
None,
310+
None,
311+
None,
312+
None,
313+
None,
314+
None,
315+
),
270316
]
271317
EXPECTED_TYPE = StructType(
272318
[
@@ -319,6 +365,7 @@
319365
StructField("XML_COL", StringType(16777216), nullable=True),
320366
]
321367
)
368+
POSTGRES_TEST_EXTERNAL_ACCESS_INTEGRATION = "snowpark_dbapi_postgres_test_integration"
322369

323370

324371
def create_postgres_connection():
@@ -327,7 +374,10 @@ def create_postgres_connection():
327374

328375
@pytest.mark.parametrize(
329376
"input_type, input_value",
330-
[("table", TEST_TABLE_NAME), ("query", f"(SELECT * FROM {TEST_TABLE_NAME})")],
377+
[
378+
("table", POSTGRES_TABLE_NAME),
379+
("query", f"(SELECT * FROM {POSTGRES_TABLE_NAME})"),
380+
],
331381
)
332382
def test_basic_postgres(session, input_type, input_value):
333383
input_dict = {
@@ -350,3 +400,58 @@ def test_error_case(session, input_type, input_value, error_message):
350400
}
351401
with pytest.raises(SnowparkDataframeReaderException, match=error_message):
352402
session.read.dbapi(create_postgres_connection, **input_dict)
403+
404+
405+
def test_query_timeout(session):
406+
with pytest.raises(
407+
SnowparkDataframeReaderException,
408+
match=r"due to exception 'QueryCanceled\('canceling statement due to statement timeout",
409+
):
410+
session.read.dbapi(
411+
create_postgres_connection,
412+
table=POSTGRES_TABLE_NAME,
413+
query_timeout=1,
414+
session_init_statement=["SELECT pg_sleep(5)"],
415+
)
416+
417+
418+
def test_external_access_integration_not_set(session):
419+
with pytest.raises(
420+
ValueError,
421+
match="external_access_integration cannot be None when udtf ingestion is used.",
422+
):
423+
session.read.dbapi(
424+
create_postgres_connection, table=POSTGRES_TABLE_NAME, udtf_configs={}
425+
)
426+
427+
428+
def test_unicode_column_name_postgres(session):
429+
df = session.read.dbapi(
430+
create_postgres_connection, table='test_schema."用户資料"'
431+
).order_by("編號")
432+
assert df.collect() == [Row(編號=1, 姓名="山田太郎", 國家="日本", 備註="これはUnicodeテストです")]
433+
assert df.columns == ['"編號"', '"姓名"', '"國家"', '"備註"']
434+
435+
436+
def test_udtf_ingestion_postgres(session, caplog):
437+
from tests.parameters import POSTGRES_CONNECTION_PARAMETERS
438+
439+
def create_connection_postgres():
440+
import psycopg2
441+
442+
return psycopg2.connect(**POSTGRES_CONNECTION_PARAMETERS)
443+
444+
df = session.read.dbapi(
445+
create_connection_postgres,
446+
table=POSTGRES_TABLE_NAME,
447+
udtf_configs={
448+
"external_access_integration": POSTGRES_TEST_EXTERNAL_ACCESS_INTEGRATION
449+
},
450+
).order_by("BIGSERIAL_COL")
451+
452+
assert df.collect() == EXPECTED_TEST_DATA
453+
# assert UDTF creation and UDTF call
454+
assert (
455+
"TEMPORARY FUNCTION data_source_udtf_" "" in caplog.text
456+
and "table(data_source_udtf" in caplog.text
457+
)

tox.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ deps =
223223
{[testenv]deps}
224224
databricks-sql-connector
225225
oracledb
226-
psycopg2
226+
psycopg2-binary
227227
commands = {env:SNOWFLAKE_PYTEST_CMD} -m "{env:SNOWFLAKE_TEST_TYPE}" {posargs:} tests/integ/datasource
228228

229229
[pytest]

0 commit comments

Comments
 (0)