SNOW-2403463: Fix xml reader with undeclared namespace (#3869)

sfc-gh-jdu · web-flow · commit 4b174e9edbcc · 2025-10-10T22:41:39.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -44,6 +44,7 @@
 
 #### Bug Fixes
 
+- Fixed a bug where `DataFrameReader.xml` failed to parse XML records whose attributes use undeclared namespace prefixes when `ignoreNamespace` is `True`.
 - Added a fix for floating point precision discrepancies in `interval_day_time_from_parts`.
 - Fixed a bug where writing Snowpark pandas dataframes on the pandas backend with a column multiindex to Snowflake with `to_snowflake` would raise `KeyError`.
 - Fixed a bug that `DataFrameReader.dbapi` (PuPr) is not compatible with oracledb 3.4.0. 
diff --git a/src/snowflake/snowpark/_internal/xml_reader.py b/src/snowflake/snowpark/_internal/xml_reader.py
@@ -468,7 +468,22 @@ def process_xml_range(
                     # to parse undeclared namespaces, we have to use recover mode
                     recover = bool(":" in tag_name)
                     parser = ET.XMLParser(recover=recover, ns_clean=True)
-                    element = ET.fromstring(record_str, parser)
+                    try:
+                        element = ET.fromstring(record_str, parser)
+                    except ET.XMLSyntaxError:
+                        # when ignoring namespaces, strip attribute prefixes
+                        # like xyz:id -> id so records with undeclared prefixes can still parse.
+                        if ignore_namespace:
+                            try:
+                                cleaned_record = re.sub(
+                                    r"\s+(\w+):(\w+)=", r" \2=", record_str
+                                )
+                                element = ET.fromstring(cleaned_record, parser)
+                            except Exception as inner_ex:
+                                # avoid chained exceptions
+                                raise inner_ex from None
+                        else:
+                            raise
                 else:
                     element = ET.fromstring(record_str)
 
diff --git a/src/snowflake/snowpark/dataframe_reader.py b/src/snowflake/snowpark/dataframe_reader.py
@@ -1031,7 +1031,9 @@ def xml(self, path: str, _emit_ast: bool = True) -> DataFrame:
                 The default value is '_corrupt_record'.
 
               + ``ignoreNamespace``: remove namespace prefixes from XML element names when constructing result column names.
-                The default value is ``True``. Note that a given prefix isn't declared on the row tag element,
+                The default value is ``True``. When this option is enabled and a record fails to parse because its attributes
+                use undeclared namespace prefixes (e.g., ``diffgr:id`` or ``msdata:rowOrder``), those attribute prefixes are stripped
+                and parsing is retried. Element name prefixes are stripped where resolvable; if a prefix isn't declared on the row tag element,
                 it cannot be resolved and will be left intact (i.e. this setting is ignored for that element).
                 For example, for the following XML data with a row tag ``abc:def``:
                 ```
diff --git a/tests/integ/test_xml_reader_row_tag.py b/tests/integ/test_xml_reader_row_tag.py
@@ -98,6 +98,12 @@ def setup(session, resources_path, local_testing_mode):
         test_files.test_xml_undeclared_namespace,
         compress=False,
     )
+    Utils.upload_to_stage(
+        session,
+        "@" + tmp_stage_name,
+        test_files.test_xml_undeclared_attr_namespace,
+        compress=False,
+    )
     Utils.upload_to_stage(
         session,
         "@" + tmp_stage_name,
@@ -296,6 +302,29 @@ def test_read_xml_undeclared_namespace(session, ignore_namespace):
     assert result[1]["'px:value'"] in ['"100"', '"200"']
 
 
+@pytest.mark.parametrize("ignore_namespace", [True, False])
+def test_read_xml_undeclared_attr_namespace(session, ignore_namespace):
+    # Attribute prefixes (e.g., diffgr:id, msdata:rowOrder) are declared only on ancestors, so each extracted
+    # <Results> ... </Results> record lacks them; parsing succeeds only when ignoreNamespace strips the prefixes.
+    row_tag = "Results"
+    df = (
+        session.read.option("rowTag", row_tag)
+        .option("cacheResult", False)
+        .option("mode", "failfast")
+        .option("ignoreNamespace", ignore_namespace)
+        .xml(f"@{tmp_stage_name}/undeclared_attr_namespace.xml")
+    )
+    if not ignore_namespace:
+        with pytest.raises(SnowparkSQLException, match="XMLSyntaxError"):
+            df.collect()
+    else:
+        result = df.collect()
+        assert len(result) == 3
+        # The fixture's NOM values are CAMUT, CAMUT and Test3, so compare the exact set of distinct values.
+        noms = {row["'NOM'"] for row in result}
+        assert noms == {'"CAMUT"', '"Test3"'}
+
+
 @pytest.mark.parametrize("attribute_prefix", ["_", ""])
 def test_read_xml_attribute_prefix(session, attribute_prefix):
     row_tag = "book"
diff --git a/tests/resources/undeclared_attr_namespace.xml b/tests/resources/undeclared_attr_namespace.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<diffgr:diffgram xmlns:msdata="urn:schemas-microsoft-com:xml-msdata" xmlns:diffgr="urn:schemas-microsoft-com:xml-diffgram-v1">
+  <DocumentElement xmlns="">
+    <Results diffgr:id="Results1" msdata:rowOrder="0">
+      <DATECREATION>201301021116</DATECREATION>
+      <IDCATEGORIE>2</IDCATEGORIE>
+      <IDMODEINSCRIPTION>P</IDMODEINSCRIPTION>
+      <PASSWORD>exp</PASSWORD>
+      <IDCONTACT>1000</IDCONTACT>
+      <NOM>CAMUT</NOM>
+      <PRENOM>Anthony</PRENOM>
+      <SOCIETE>TECH-EVENT INC.</SOCIETE>
+    </Results>
+    <Results diffgr:id="Results2" msdata:rowOrder="1">
+      <DATECREATION>201212041605</DATECREATION>
+      <IDCATEGORIE>ADMIN</IDCATEGORIE>
+      <IDMODEINSCRIPTION>P</IDMODEINSCRIPTION>
+      <PASSWORD>test</PASSWORD>
+      <IDCONTACT>600208</IDCONTACT>
+      <NOM>CAMUT</NOM>
+      <PRENOM>Anthony</PRENOM>
+      <SOCIETE>TECH-EVENT</SOCIETE>
+    </Results>
+    <Results diffgr:id="Results3" msdata:rowOrder="2">
+      <DATECREATION>201212071210</DATECREATION>
+      <IDCATEGORIE>VECCLI</IDCATEGORIE>
+      <IDMODEINSCRIPTION>P</IDMODEINSCRIPTION>
+      <IDCONTACT>600241</IDCONTACT>
+      <NOM>Test3</NOM>
+      <PRENOM>Alpha</PRENOM>
+      <SOCIETE>Example Corp</SOCIETE>
+    </Results>
+  </DocumentElement>
+</diffgr:diffgram>
+
+
diff --git a/tests/unit/scala/test_utils_suite.py b/tests/unit/scala/test_utils_suite.py
@@ -385,6 +385,7 @@ def check_zip_files_and_close_stream(input_stream, expected_files):
                 "resources/test_udtf_dir/test_vectorized_udtf.py",
                 "resources/test_udaf_dir/",
                 "resources/test_udaf_dir/test_udaf_file.py",
+                "resources/undeclared_attr_namespace.xml",
                 "resources/undeclared_namespace.xml",
                 "resources/xxe.xml",
             ],
diff --git a/tests/utils.py b/tests/utils.py
@@ -1701,6 +1701,10 @@ def test_xml_declared_namespace(self):
     def test_xml_undeclared_namespace(self):
         return os.path.join(self.resources_path, "undeclared_namespace.xml")
 
+    @property
+    def test_xml_undeclared_attr_namespace(self):
+        return os.path.join(self.resources_path, "undeclared_attr_namespace.xml")
+
     @property
     def test_null_value_xml(self):
         return os.path.join(self.resources_path, "null_value.xml")