Skip to content

Commit 4b174e9

Browse files
authored
SNOW-2403463: Fix xml reader with undeclared namespace (#3869)
1 parent c004dc4 commit 4b174e9

File tree

7 files changed

+90
-2
lines changed

7 files changed

+90
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444

4545
#### Bug Fixes
4646

47+
- Fixed a bug where `DataFrameReader.xml` failed to parse XML files with undeclared namespaces when `ignoreNamespace` is `True`.
4748
- Added a fix for floating point precision discrepancies in `interval_day_time_from_parts`.
4849
- Fixed a bug where writing Snowpark pandas dataframes on the pandas backend with a column multiindex to Snowflake with `to_snowflake` would raise `KeyError`.
4950
- Fixed a bug where `DataFrameReader.dbapi` (PuPr) was not compatible with oracledb 3.4.0.

src/snowflake/snowpark/_internal/xml_reader.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,22 @@ def process_xml_range(
468468
# to parse undeclared namespaces, we have to use recover mode
469469
recover = bool(":" in tag_name)
470470
parser = ET.XMLParser(recover=recover, ns_clean=True)
471-
element = ET.fromstring(record_str, parser)
471+
try:
472+
element = ET.fromstring(record_str, parser)
473+
except ET.XMLSyntaxError:
474+
# when ignoring namespaces, strip attribute prefixes
475+
# like xyz:id -> id so records with undeclared prefixes can still parse.
476+
if ignore_namespace:
477+
try:
478+
cleaned_record = re.sub(
479+
r"\s+(\w+):(\w+)=", r" \2=", record_str
480+
)
481+
element = ET.fromstring(cleaned_record, parser)
482+
except Exception as inner_ex:
483+
# avoid chained exceptions
484+
raise inner_ex from None
485+
else:
486+
raise
472487
else:
473488
element = ET.fromstring(record_str)
474489

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1031,7 +1031,9 @@ def xml(self, path: str, _emit_ast: bool = True) -> DataFrame:
10311031
The default value is '_corrupt_record'.
10321032
10331033
+ ``ignoreNamespace``: remove namespace prefixes from XML element names when constructing result column names.
1034-
The default value is ``True``. Note that a given prefix isn't declared on the row tag element,
1034+
The default value is ``True``. When this option is enabled and a record fails to parse because of undeclared
1035+
namespace prefixes in attribute names (e.g., ``diffgr:id`` or ``msdata:rowOrder``), those prefixes are stripped and parsing is retried.
1036+
Element name prefixes are stripped where resolvable; if a prefix isn't declared on the row tag element,
10351037
it cannot be resolved and will be left intact (i.e. this setting is ignored for that element).
10361038
For example, for the following XML data with a row tag ``abc:def``:
10371039
```

tests/integ/test_xml_reader_row_tag.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ def setup(session, resources_path, local_testing_mode):
9898
test_files.test_xml_undeclared_namespace,
9999
compress=False,
100100
)
101+
Utils.upload_to_stage(
102+
session,
103+
"@" + tmp_stage_name,
104+
test_files.test_xml_undeclared_attr_namespace,
105+
compress=False,
106+
)
101107
Utils.upload_to_stage(
102108
session,
103109
"@" + tmp_stage_name,
@@ -296,6 +302,29 @@ def test_read_xml_undeclared_namespace(session, ignore_namespace):
296302
assert result[1]["'px:value'"] in ['"100"', '"200"']
297303

298304

305+
@pytest.mark.parametrize("ignore_namespace", [True, False])
306+
def test_read_xml_undeclared_attr_namespace(session, ignore_namespace):
307+
# File has attribute prefixes (e.g., diffgr:id, msdata:rowOrder) declared only on ancestors.
308+
# Reader extracts <Results> ... </Results> records without the declarations; parsing must still succeed when ignoreNamespace is True, and raises XMLSyntaxError in failfast mode otherwise.
309+
row_tag = "Results"
310+
df = (
311+
session.read.option("rowTag", row_tag)
312+
.option("cacheResult", False)
313+
.option("mode", "failfast")
314+
.option("ignoreNamespace", ignore_namespace)
315+
.xml(f"@{tmp_stage_name}/undeclared_attr_namespace.xml")
316+
)
317+
if not ignore_namespace:
318+
with pytest.raises(SnowparkSQLException, match="XMLSyntaxError"):
319+
df.collect()
320+
else:
321+
result = df.collect()
322+
assert len(result) == 3
323+
noms = {result[0]["'NOM'"], result[1]["'NOM'"], result[2]["'NOM'"]}
324+
assert '"CAMUT"' in noms
325+
assert any(v in noms for v in ['"CAMUT"', '"Test2"', '"Test3"'])
326+
327+
299328
@pytest.mark.parametrize("attribute_prefix", ["_", ""])
300329
def test_read_xml_attribute_prefix(session, attribute_prefix):
301330
row_tag = "book"
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<diffgr:diffgram xmlns:msdata="urn:schemas-microsoft-com:xml-msdata" xmlns:diffgr="urn:schemas-microsoft-com:xml-diffgram-v1">
3+
<DocumentElement xmlns="">
4+
<Results diffgr:id="Results1" msdata:rowOrder="0">
5+
<DATECREATION>201301021116</DATECREATION>
6+
<IDCATEGORIE>2</IDCATEGORIE>
7+
<IDMODEINSCRIPTION>P</IDMODEINSCRIPTION>
8+
<PASSWORD>exp</PASSWORD>
9+
<IDCONTACT>1000</IDCONTACT>
10+
<NOM>CAMUT</NOM>
11+
<PRENOM>Anthony</PRENOM>
12+
<SOCIETE>TECH-EVENT INC.</SOCIETE>
13+
</Results>
14+
<Results diffgr:id="Results2" msdata:rowOrder="1">
15+
<DATECREATION>201212041605</DATECREATION>
16+
<IDCATEGORIE>ADMIN</IDCATEGORIE>
17+
<IDMODEINSCRIPTION>P</IDMODEINSCRIPTION>
18+
<PASSWORD>test</PASSWORD>
19+
<IDCONTACT>600208</IDCONTACT>
20+
<NOM>CAMUT</NOM>
21+
<PRENOM>Anthony</PRENOM>
22+
<SOCIETE>TECH-EVENT</SOCIETE>
23+
</Results>
24+
<Results diffgr:id="Results3" msdata:rowOrder="2">
25+
<DATECREATION>201212071210</DATECREATION>
26+
<IDCATEGORIE>VECCLI</IDCATEGORIE>
27+
<IDMODEINSCRIPTION>P</IDMODEINSCRIPTION>
28+
<IDCONTACT>600241</IDCONTACT>
29+
<NOM>Test3</NOM>
30+
<PRENOM>Alpha</PRENOM>
31+
<SOCIETE>Example Corp</SOCIETE>
32+
</Results>
33+
</DocumentElement>
34+
</diffgr:diffgram>
35+
36+

tests/unit/scala/test_utils_suite.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,7 @@ def check_zip_files_and_close_stream(input_stream, expected_files):
385385
"resources/test_udtf_dir/test_vectorized_udtf.py",
386386
"resources/test_udaf_dir/",
387387
"resources/test_udaf_dir/test_udaf_file.py",
388+
"resources/undeclared_attr_namespace.xml",
388389
"resources/undeclared_namespace.xml",
389390
"resources/xxe.xml",
390391
],

tests/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1701,6 +1701,10 @@ def test_xml_declared_namespace(self):
17011701
def test_xml_undeclared_namespace(self):
17021702
return os.path.join(self.resources_path, "undeclared_namespace.xml")
17031703

1704+
@property
1705+
def test_xml_undeclared_attr_namespace(self):
1706+
return os.path.join(self.resources_path, "undeclared_attr_namespace.xml")
1707+
17041708
@property
17051709
def test_null_value_xml(self):
17061710
return os.path.join(self.resources_path, "null_value.xml")

0 commit comments

Comments
 (0)