column name

sfc-gh-jdu · sfc-gh-jdu · commit 95f279280c41 · 2025-05-07T11:29:52.000-07:00
diff --git a/src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py b/src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py
@@ -1374,6 +1374,9 @@ def _create_xml_query(
         xml_row_number_column_name = "XML_ROW_NUMBER"
         row_tag = options[XML_ROW_TAG_STRING]
         mode = options.get("MODE", "PERMISSIVE").upper()
+        column_name_of_corrupt_record = options.get(
+            "COLUMNNAMEOFCORRUPTRECORD", "_corrupt_record"
+        )
 
         if mode not in {"PERMISSIVE", "DROPMALFORMED", "FAILFAST"}:
             raise ValueError(
@@ -1402,6 +1405,7 @@ def _create_xml_query(
                 lit(row_tag),
                 col(worker_column_name),
                 lit(mode),
+                lit(column_name_of_corrupt_record),
             ),
         )
 
diff --git a/src/snowflake/snowpark/_internal/xml_reader.py b/src/snowflake/snowpark/_internal/xml_reader.py
@@ -13,7 +13,6 @@
 
 DEFAULT_CHUNK_SIZE: int = 1024
 VARIANT_COLUMN_SIZE_LIMIT: int = 16 * 1024 * 1024
-COLUMN_NAME_OF_CORRUPT_RECORD = "columnNameOfCorruptRecord"
 
 
 def replace_entity(match: re.Match) -> str:
@@ -300,6 +299,7 @@ def process_xml_range(
     approx_start: int,
     approx_end: int,
     mode: str,
+    column_name_of_corrupt_record: str,
     chunk_size: int = DEFAULT_CHUNK_SIZE,
 ) -> Iterator[Optional[Dict[str, Any]]]:
     """
@@ -320,6 +320,7 @@ def process_xml_range(
         approx_end (int): Approximate end byte position.
         mode (str): The mode for dealing with corrupt records.
             "PERMISSIVE", "DROPMALFORMED" and "FAILFAST" are supported.
+        column_name_of_corrupt_record (str): The name of the column for corrupt records.
         chunk_size (int): Size of chunks to read.
 
     Yields:
@@ -363,7 +364,7 @@ def process_xml_range(
                     record_bytes = f.read(VARIANT_COLUMN_SIZE_LIMIT)
                     record_str = record_bytes.decode("utf-8", errors="replace")
                     record_str = re.sub(r"&(\w+);", replace_entity, record_str)
-                    yield {COLUMN_NAME_OF_CORRUPT_RECORD: record_str}
+                    yield {column_name_of_corrupt_record: record_str}
                 elif mode == "FAILFAST":
                     raise EOFError(
                         f"Malformed XML record at bytes {record_start}-EOF: {e}"
@@ -384,7 +385,7 @@ def process_xml_range(
                         record_bytes = f.read(VARIANT_COLUMN_SIZE_LIMIT)
                         record_str = record_bytes.decode("utf-8", errors="replace")
                         record_str = re.sub(r"&(\w+);", replace_entity, record_str)
-                        yield {COLUMN_NAME_OF_CORRUPT_RECORD: record_str}
+                        yield {column_name_of_corrupt_record: record_str}
                     elif mode == "FAILFAST":
                         raise EOFError(
                             f"Malformed XML record at bytes {record_start}-EOF: {e}"
@@ -402,7 +403,7 @@ def process_xml_range(
                 yield element_to_dict(strip_namespaces(element))
             except ET.ParseError as e:
                 if mode == "PERMISSIVE":
-                    yield {COLUMN_NAME_OF_CORRUPT_RECORD: record_str}
+                    yield {column_name_of_corrupt_record: record_str}
                 elif mode == "FAILFAST":
                     raise RuntimeError(
                         f"Malformed XML record at bytes {record_start}-{record_end}: {e}"
@@ -416,7 +417,15 @@ def process_xml_range(
 
 
 class XMLReader:
-    def process(self, filename: str, num_workers: int, row_tag: str, i: int, mode: str):
+    def process(
+        self,
+        filename: str,
+        num_workers: int,
+        row_tag: str,
+        i: int,
+        mode: str,
+        column_name_of_corrupt_record: str,
+    ):
         """
         Splits the file into byte ranges—one per worker—by starting with an even
         file size division and then moving each boundary to the end of a record,
@@ -429,12 +438,18 @@ def process(self, filename: str, num_workers: int, row_tag: str, i: int, mode: s
             i (int): The worker id.
             mode (str): The mode for dealing with corrupt records.
                 "PERMISSIVE", "DROPMALFORMED" and "FAILFAST" are supported.
+            column_name_of_corrupt_record (str): The name of the column for corrupt records.
         """
         file_size = get_file_size(filename)
         approx_chunk_size = file_size // num_workers
         approx_start = approx_chunk_size * i
         approx_end = approx_chunk_size * (i + 1) if i < num_workers - 1 else file_size
         for element in process_xml_range(
-            filename, row_tag, approx_start, approx_end, mode
+            filename,
+            row_tag,
+            approx_start,
+            approx_end,
+            mode,
+            column_name_of_corrupt_record,
         ):
             yield (element,)
diff --git a/src/snowflake/snowpark/dataframe_reader.py b/src/snowflake/snowpark/dataframe_reader.py
@@ -864,13 +864,16 @@ def xml(self, path: str, _emit_ast: bool = True) -> DataFrame:
             - When ``rowTag`` is specified, the following options are supported for reading XML files
               via :meth:`option()` or :meth:`options()`:
 
-              + ``mode``: Specifies the mode of  for dealing with corrupt XML records. The default value is ``PERMISSIVE``. The supported values are:
+              + ``mode``: Specifies the mode for dealing with corrupt XML records. The default value is ``PERMISSIVE``. The supported values are:
 
-                  - ``PERMISSIVE``: When it encounters a corrupt record, it sets all fields to null and includes a 'columnNameOfCorruptRecord' column.
+                  - ``PERMISSIVE``: When it encounters a corrupt record, it sets all fields to null and stores the raw record in the column named by the ``columnNameOfCorruptRecord`` option.
 
                   - ``DROPMALFORMED``: Ignores the whole record that cannot be parsed correctly.
 
                   - ``FAILFAST``: When it encounters a corrupt record, it raises an exception immediately.
+
+              + ``columnNameOfCorruptRecord``: Specifies the name of the column that contains the corrupt record.
+                The default value is '_corrupt_record'.
         """
         df = self._read_semi_structured_file(path, "XML")
 
diff --git a/tests/integ/scala/test_dataframe_reader_suite.py b/tests/integ/scala/test_dataframe_reader_suite.py
@@ -2069,10 +2069,10 @@ def test_read_malformed_xml(session, file):
     )
     result = df.collect()
     assert len(result) == 2
-    assert len(result[0]) == 4  # has another column 'columnNameOfCorruptRecord'
+    assert len(result[0]) == 4  # has another column '_corrupt_record'
     assert (
-        result[0]["'columnNameOfCorruptRecord'"] is not None
-        or result[1]["'columnNameOfCorruptRecord'"] is not None
+        result[0]["'_corrupt_record'"] is not None
+        or result[1]["'_corrupt_record'"] is not None
     )
 
     # dropmalformed mode

Original file line number	Diff line number	Diff line change
`@@ -2069,10 +2069,10 @@ def test_read_malformed_xml(session, file):`
`2069`	`2069`	`)`
`2070`	`2070`	`result = df.collect()`
`2071`	`2071`	`assert len(result) == 2`
`2072`		`- assert len(result[0]) == 4 # has another column 'columnNameOfCorruptRecord'`
	`2072`	`+ assert len(result[0]) == 4 # has another column '_corrupt_record'`
`2073`	`2073`	`assert (`
`2074`		`- result[0]["'columnNameOfCorruptRecord'"] is not None`
`2075`		`- or result[1]["'columnNameOfCorruptRecord'"] is not None`
	`2074`	`+ result[0]["'_corrupt_record'"] is not None`
	`2075`	`+ or result[1]["'_corrupt_record'"] is not None`
`2076`	`2076`	`)`
`2077`	`2077`
`2078`	`2078`	`# dropmalformed mode`