Merge branch 'main' into SNOW-1955847-feat-add-support-to-postgresql

sfc-gh-aling · web-flow · commit 7e70380137b9 · 2025-05-12T15:45:48.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,7 +12,8 @@
 
 - Invoking snowflake system procedures does not invoke an additional `describe procedure` call to check the return type of the procedure.
 - Added support for `Session.create_dataframe()` with the stage URL and FILE data type.
-- Added support for different modes for dealing with corrupt XML records when reading an XML file using `session.read.option('rowTag', <tag_name>).xml(<stage_file_path>)`. Currently `PERMISSIVE`, `DROPMALFORMED` and `FAILFAST` are supported.
+- Added support for different modes for dealing with corrupt XML records when reading an XML file using `session.read.option('mode', <mode>).option('rowTag', <tag_name>).xml(<stage_file_path>)`. Currently `PERMISSIVE`, `DROPMALFORMED` and `FAILFAST` are supported.
+- Improved the error message of the XML reader when the specified row tag is not found in the file.
 - Improved query generation for `Dataframe.drop` to use `SELECT * EXCLUDE ()` to exclude the dropped columns. To enable this feature, set `session.conf.set("use_simplified_query_generation", True)`.
 
 #### Bug Fixes
diff --git a/src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py b/src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py
@@ -150,6 +150,10 @@ def wrap(*args, **kwargs):
                 try:
                     return func(*args, **kwargs)
                 except snowflake.connector.errors.ProgrammingError as e:
+                    from snowflake.snowpark._internal.analyzer.select_statement import (
+                        Selectable,
+                    )
+
                     query = getattr(e, "query", None)
                     tb = sys.exc_info()[2]
                     assert e.msg is not None
@@ -209,10 +213,6 @@ def wrap(*args, **kwargs):
                             )
                             raise ne.with_traceback(tb) from None
                         else:
-                            from snowflake.snowpark._internal.analyzer.select_statement import (
-                                Selectable,
-                            )
-
                             # We need the potential double quotes for invalid identifier
                             match = SnowflakePlan.Decorator.__wrap_exception_regex_match_with_double_quotes.match(
                                 e.msg
@@ -277,11 +277,53 @@ def add_single_quote(string: str) -> str:
                                 e
                             )
                             raise ne.with_traceback(tb) from None
-                    else:
-                        ne = SnowparkClientExceptionMessages.SQL_EXCEPTION_FROM_PROGRAMMING_ERROR(
-                            e
-                        )
-                        raise ne.with_traceback(tb) from None
+                    elif e.sqlstate == "42601" and "SELECT with no columns" in e.msg:
+                        # Special case: a SELECT with no columns can result from an XML
+                        # read in which the specified row tag is not found in the file.
+
+                        def search_read_file_node(
+                            node: Union[SnowflakePlan, Selectable]
+                        ) -> Optional[ReadFileNode]:
+                            for child in node.children_plan_nodes:
+                                source_plan = (
+                                    child.source_plan
+                                    if isinstance(child, SnowflakePlan)
+                                    else child.snowflake_plan.source_plan
+                                )
+                                if isinstance(source_plan, ReadFileNode):
+                                    return source_plan
+                                result = search_read_file_node(child)
+                                if result:
+                                    return result
+                            return None
+
+                        for arg in args:
+                            if isinstance(arg, SnowflakePlan):
+                                read_file_node = search_read_file_node(arg)
+                                if (
+                                    read_file_node
+                                    and read_file_node.xml_reader_udtf is not None
+                                ):
+                                    row_tag = read_file_node.options.get(
+                                        XML_ROW_TAG_STRING
+                                    )
+                                    file_path = read_file_node.path
+                                    ne = SnowparkClientExceptionMessages.DF_XML_ROW_TAG_NOT_FOUND(
+                                        row_tag, file_path
+                                    )
+                                    raise ne.with_traceback(tb) from None
+                            # when the describe query fails, the arg is a query string
+                            elif isinstance(arg, str):
+                                if f'"{XML_ROW_DATA_COLUMN_NAME}"' in arg:
+                                    ne = (
+                                        SnowparkClientExceptionMessages.DF_XML_ROW_TAG_NOT_FOUND()
+                                    )
+                                    raise ne.with_traceback(tb) from None
+
+                    ne = SnowparkClientExceptionMessages.SQL_EXCEPTION_FROM_PROGRAMMING_ERROR(
+                        e
+                    )
+                    raise ne.with_traceback(tb) from None
 
             return wrap
 
diff --git a/src/snowflake/snowpark/_internal/error_message.py b/src/snowflake/snowpark/_internal/error_message.py
@@ -127,6 +127,17 @@ def DF_COPY_INTO_CANNOT_CREATE_TABLE(
             f"Cannot create the target table {table_name} because Snowpark cannot determine the column names to use. You should create the table before calling copy_into_table()."
         )
 
+    @staticmethod
+    def DF_XML_ROW_TAG_NOT_FOUND(
+        row_tag: Optional[str] = None,
+        file_path: Optional[str] = None,
+    ) -> SnowparkDataframeReaderException:
+        if row_tag is not None and file_path is not None:
+            msg = f"Cannot find the row tag '{row_tag}' in the XML file {file_path}."
+        else:
+            msg = "Cannot find the row tag in the XML file."
+        return SnowparkDataframeReaderException(msg)
+
     @staticmethod
     def DF_CROSS_TAB_COUNT_TOO_LARGE(
         count: int, max_count: int
diff --git a/tests/integ/scala/test_dataframe_reader_suite.py b/tests/integ/scala/test_dataframe_reader_suite.py
@@ -2093,3 +2093,25 @@ def test_read_malformed_xml(session, file):
     )
     with pytest.raises(SnowparkSQLException, match="Malformed XML record at bytes"):
         df.collect()
+
+
+@pytest.mark.skipif(
+    "config.getoption('local_testing_mode', default=False)",
+    reason="xml not supported in local testing mode",
+)
+def test_read_xml_row_tag_not_found(session):
+    row_tag = "non-existing-tag"
+    df = session.read.option("rowTag", row_tag).xml(
+        f"@{tmp_stage_name1}/{test_file_books_xml}"
+    )
+
+    with pytest.raises(
+        SnowparkDataframeReaderException, match="Cannot find the row tag"
+    ):
+        df.collect()
+
+    # also works for nested query plan
+    with pytest.raises(
+        SnowparkDataframeReaderException, match="Cannot find the row tag"
+    ):
+        df.filter(lit(True)).collect()