Skip to content

Commit 1c84a9f

Browse files
committed
add test and full logic
1 parent 2bac9f6 commit 1c84a9f

File tree

5 files changed

+345
-106
lines changed

5 files changed

+345
-106
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
#### New Features
88

9+
- Allow a user-specified schema when reading an XML file on a stage.
10+
911
#### Bug Fixes
1012

1113
#### Improvements

src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,14 +1813,15 @@ def _create_xml_query(
18131813
xml_reader_udtf: "UserDefinedTableFunction",
18141814
file_path: str,
18151815
options: Dict[str, str],
1816-
schema_string: str,
1816+
schema: List[Attribute],
18171817
) -> str:
18181818
"""
18191819
Creates a DataFrame from a UserDefinedTableFunction that reads XML files.
18201820
"""
18211821
from snowflake.snowpark.functions import lit, col, seq8, flatten
18221822
from snowflake.snowpark._internal.xml_reader import DEFAULT_CHUNK_SIZE
18231823

1824+
schema_string = attribute_to_schema_string(schema) if schema is not None else ""
18241825
worker_column_name = "WORKER"
18251826
xml_row_number_column_name = "XML_ROW_NUMBER"
18261827
row_tag = options[XML_ROW_TAG_STRING]
@@ -1908,10 +1909,9 @@ def read_file(
19081909
source_plan: Optional[ReadFileNode] = None,
19091910
) -> SnowflakePlan:
19101911
thread_safe_session_enabled = self.session._conn._thread_safe_session_enabled
1911-
schema_string = attribute_to_schema_string(schema)
19121912
if xml_reader_udtf is not None:
19131913
xml_query = self._create_xml_query(
1914-
xml_reader_udtf, path, options, schema_string if use_user_schema else ""
1914+
xml_reader_udtf, path, options, schema if use_user_schema else None
19151915
)
19161916
return SnowflakePlan(
19171917
[Query(xml_query)],

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
drop_file_format_if_exists_statement,
2020
infer_schema_statement,
2121
quote_name_without_upper_casing,
22+
single_quote,
2223
)
2324
from snowflake.snowpark._internal.analyzer.expression import Attribute
2425
from snowflake.snowpark._internal.analyzer.snowflake_plan_node import ReadFileNode
@@ -630,10 +631,10 @@ def table(
630631

631632
@publicapi
632633
def schema(self, schema: StructType, _emit_ast: bool = True) -> "DataFrameReader":
633-
"""Define the schema for CSV files that you want to read.
634+
"""Define the schema for CSV or XML files that you want to read.
634635
635636
Args:
636-
schema: Schema configuration for the CSV file to be read.
637+
schema: Schema configuration for the CSV or XML file to be read.
637638
638639
Returns:
639640
a :class:`DataFrameReader` instance with the specified schema configuration for the data to be read.
@@ -1069,7 +1070,17 @@ def xml(self, path: str, _emit_ast: bool = True) -> DataFrame:
10691070
ast.reader.CopyFrom(self._ast)
10701071
df._ast_id = stmt.uid
10711072

1072-
return df
1073+
# cast to input custom schema type
1074+
if self._user_schema:
1075+
cols = [
1076+
df[single_quote(field._name)]
1077+
.cast(field.datatype)
1078+
.alias(quote_name_without_upper_casing(field._name))
1079+
for field in self._user_schema.fields
1080+
]
1081+
return df.select(cols)
1082+
else:
1083+
return df
10731084

10741085
@publicapi
10751086
def option(self, key: str, value: Any, _emit_ast: bool = True) -> "DataFrameReader":

tests/integ/test_xml_reader_row_tag.py

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
22
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
33
#
4-
4+
import datetime
55
import logging
66
import json
77
import pytest
@@ -12,6 +12,13 @@
1212
SnowparkSQLException,
1313
)
1414
from snowflake.snowpark.functions import col, lit
15+
from snowflake.snowpark.types import (
16+
StructType,
17+
StructField,
18+
StringType,
19+
DoubleType,
20+
DateType,
21+
)
1522
from tests.utils import TestFiles, Utils
1623

1724

@@ -467,3 +474,138 @@ def test_read_xml_row_validation_xsd_path_failfast(session):
467474
session.read.option("rowTag", row_tag).option(
468475
"rowValidationXSDPath", f"@{tmp_stage_name}/{test_file_books_xsd}"
469476
).option("mode", "failfast").xml(f"@{tmp_stage_name}/{test_file_books_xml}")
477+
478+
479+
def test_read_xml_with_custom_schema(session):
480+
481+
# the user-provided schema omits 'description' and adds an 'extra_col';
482+
# the output shall have the same structure as the input schema, i.e. no 'description'
483+
# column and an 'extra_col' filled with null values;
484+
# the casing of the schema field names is also preserved
485+
user_schema = StructType(
486+
[
487+
StructField("Author", StringType(), True),
488+
StructField("Title", StringType(), True),
489+
StructField("genre", StringType(), True),
490+
StructField("PRICE", DoubleType(), True),
491+
StructField("publish_Date", DateType(), True),
492+
StructField("extra_col", StringType(), True),
493+
]
494+
)
495+
# field-name casing is preserved, matching PySpark's behavior
496+
expected_schema = StructType(
497+
[
498+
StructField('"Author"', StringType(), nullable=True),
499+
StructField('"Title"', StringType(), nullable=True),
500+
StructField('"genre"', StringType(), nullable=True),
501+
StructField("PRICE", DoubleType(), nullable=True),
502+
StructField('"publish_Date"', DateType(), nullable=True),
503+
StructField('"extra_col"', StringType(), nullable=True),
504+
]
505+
)
506+
507+
df = (
508+
session.read.option("rowTag", "book")
509+
.schema(user_schema)
510+
.xml(f"@{tmp_stage_name}/{test_file_books_xml}")
511+
)
512+
expected_result = [
513+
Row(
514+
Author="Gambardella, Matthew",
515+
Title="XML Developer's Guide",
516+
genre="Computer",
517+
PRICE=44.95,
518+
publish_Date=datetime.date(2000, 10, 1),
519+
extra_col=None,
520+
),
521+
Row(
522+
Author="Corets, Eva",
523+
Title="Maeve Ascendant",
524+
genre="Fantasy",
525+
PRICE=5.95,
526+
publish_Date=datetime.date(2000, 11, 17),
527+
extra_col=None,
528+
),
529+
Row(
530+
Author="Kress, Peter",
531+
Title="Paradox Lost",
532+
genre="Science Fiction",
533+
PRICE=6.95,
534+
publish_Date=datetime.date(2000, 11, 2),
535+
extra_col=None,
536+
),
537+
Row(
538+
Author="Ralls, Kim",
539+
Title="Midnight Rain",
540+
genre="Fantasy",
541+
PRICE=5.95,
542+
publish_Date=datetime.date(2000, 12, 16),
543+
extra_col=None,
544+
),
545+
Row(
546+
Author="Knorr, Stefan",
547+
Title="Creepy Crawlies",
548+
genre="Horror",
549+
PRICE=4.95,
550+
publish_Date=datetime.date(2000, 12, 6),
551+
extra_col=None,
552+
),
553+
Row(
554+
Author="Thurman, Paula",
555+
Title="Splish Splash",
556+
genre="Romance",
557+
PRICE=4.95,
558+
publish_Date=datetime.date(2000, 11, 2),
559+
extra_col=None,
560+
),
561+
Row(
562+
Author="Randall, Cynthia",
563+
Title="Lover Birds",
564+
genre="Romance",
565+
PRICE=4.95,
566+
publish_Date=datetime.date(2000, 9, 2),
567+
extra_col=None,
568+
),
569+
Row(
570+
Author="Corets, Eva",
571+
Title="The Sundered Grail",
572+
genre="Fantasy",
573+
PRICE=5.95,
574+
publish_Date=datetime.date(2001, 9, 10),
575+
extra_col=None,
576+
),
577+
Row(
578+
Author="Corets, Eva",
579+
Title="Oberon's Legacy",
580+
genre="Fantasy",
581+
PRICE=5.95,
582+
publish_Date=datetime.date(2001, 3, 10),
583+
extra_col=None,
584+
),
585+
Row(
586+
Author="O'Brien, Tim",
587+
Title="Microsoft .NET: The Programming Bible",
588+
genre="Computer",
589+
PRICE=36.95,
590+
publish_Date=datetime.date(2000, 12, 9),
591+
extra_col=None,
592+
),
593+
Row(
594+
Author="O'Brien, Tim",
595+
Title="MSXML3: A Comprehensive Guide",
596+
genre="Computer",
597+
PRICE=36.95,
598+
publish_Date=datetime.date(2000, 12, 1),
599+
extra_col=None,
600+
),
601+
Row(
602+
Author="Galos, Mike",
603+
Title="Visual Studio 7: A Comprehensive Guide",
604+
genre="Computer",
605+
PRICE=49.95,
606+
publish_Date=datetime.date(2001, 4, 16),
607+
extra_col=None,
608+
),
609+
]
610+
Utils.check_answer(df, expected_result)
611+
assert df.schema == expected_schema

0 commit comments

Comments
 (0)