Skip to content

Commit 1c84a9f

Browse files
committed
add test and full logic
1 parent 2bac9f6 commit 1c84a9f

File tree

5 files changed

+345
-106
lines changed

5 files changed

+345
-106
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
#### New Features
88

9+
- Allow a user-specified schema when reading an XML file on a stage.
10+
911
#### Bug Fixes
1012

1113
#### Improvements

src/snowflake/snowpark/_internal/analyzer/snowflake_plan.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1813,14 +1813,15 @@ def _create_xml_query(
18131813
xml_reader_udtf: "UserDefinedTableFunction",
18141814
file_path: str,
18151815
options: Dict[str, str],
1816-
schema_string: str,
1816+
schema: List[Attribute],
18171817
) -> str:
18181818
"""
18191819
Creates a DataFrame from a UserDefinedTableFunction that reads XML files.
18201820
"""
18211821
from snowflake.snowpark.functions import lit, col, seq8, flatten
18221822
from snowflake.snowpark._internal.xml_reader import DEFAULT_CHUNK_SIZE
18231823

1824+
schema_string = attribute_to_schema_string(schema) if schema is not None else ""
18241825
worker_column_name = "WORKER"
18251826
xml_row_number_column_name = "XML_ROW_NUMBER"
18261827
row_tag = options[XML_ROW_TAG_STRING]
@@ -1908,10 +1909,9 @@ def read_file(
19081909
source_plan: Optional[ReadFileNode] = None,
19091910
) -> SnowflakePlan:
19101911
thread_safe_session_enabled = self.session._conn._thread_safe_session_enabled
1911-
schema_string = attribute_to_schema_string(schema)
19121912
if xml_reader_udtf is not None:
19131913
xml_query = self._create_xml_query(
1914-
xml_reader_udtf, path, options, schema_string if use_user_schema else ""
1914+
xml_reader_udtf, path, options, schema if use_user_schema else None
19151915
)
19161916
return SnowflakePlan(
19171917
[Query(xml_query)],

src/snowflake/snowpark/dataframe_reader.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
drop_file_format_if_exists_statement,
2020
infer_schema_statement,
2121
quote_name_without_upper_casing,
22+
single_quote,
2223
)
2324
from snowflake.snowpark._internal.analyzer.expression import Attribute
2425
from snowflake.snowpark._internal.analyzer.snowflake_plan_node import ReadFileNode
@@ -630,10 +631,10 @@ def table(
630631

631632
@publicapi
632633
def schema(self, schema: StructType, _emit_ast: bool = True) -> "DataFrameReader":
633-
"""Define the schema for CSV files that you want to read.
634+
"""Define the schema for CSV or XML files that you want to read.
634635
635636
Args:
636-
schema: Schema configuration for the CSV file to be read.
637+
schema: Schema configuration for the CSV or XML file to be read.
637638
638639
Returns:
639640
a :class:`DataFrameReader` instance with the specified schema configuration for the data to be read.
@@ -1069,7 +1070,17 @@ def xml(self, path: str, _emit_ast: bool = True) -> DataFrame:
10691070
ast.reader.CopyFrom(self._ast)
10701071
df._ast_id = stmt.uid
10711072

1072-
return df
1073+
# cast to input custom schema type
1074+
if self._user_schema:
1075+
cols = [
1076+
df[single_quote(field._name)]
1077+
.cast(field.datatype)
1078+
.alias(quote_name_without_upper_casing(field._name))
1079+
for field in self._user_schema.fields
1080+
]
1081+
return df.select(cols)
1082+
else:
1083+
return df
10731084

10741085
@publicapi
10751086
def option(self, key: str, value: Any, _emit_ast: bool = True) -> "DataFrameReader":

tests/integ/test_xml_reader_row_tag.py

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#
22
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
33
#
4-
4+
import datetime
55
import logging
66
import json
77
import pytest
@@ -12,6 +12,13 @@
1212
SnowparkSQLException,
1313
)
1414
from snowflake.snowpark.functions import col, lit
15+
from snowflake.snowpark.types import (
16+
StructType,
17+
StructField,
18+
StringType,
19+
DoubleType,
20+
DateType,
21+
)
1522
from tests.utils import TestFiles, Utils
1623

1724

@@ -467,3 +474,138 @@ def test_read_xml_row_validation_xsd_path_failfast(session):
467474
session.read.option("rowTag", row_tag).option(
468475
"rowValidationXSDPath", f"@{tmp_stage_name}/{test_file_books_xsd}"
469476
).option("mode", "failfast").xml(f"@{tmp_stage_name}/{test_file_books_xml}")
477+
478+
479+
def test_read_xml_with_custom_schema(session):
480+
481+
# the user-provided schema omits 'description' and adds an 'extra_col';
482+
# the output shall have the same structure as the input schema, i.e. no 'description'
483+
# column and an 'extra_col' filled with null values;
484+
# the casing of the schema field names is also preserved
485+
user_schema = StructType(
486+
[
487+
StructField("Author", StringType(), True),
488+
StructField("Title", StringType(), True),
489+
StructField("genre", StringType(), True),
490+
StructField("PRICE", DoubleType(), True),
491+
StructField("publish_Date", DateType(), True),
492+
StructField("extra_col", StringType(), True),
493+
]
494+
)
495+
# field-name casing is preserved, matching PySpark's behavior
496+
expected_schema = StructType(
497+
[
498+
StructField('"Author"', StringType(), nullable=True),
499+
StructField('"Title"', StringType(), nullable=True),
500+
StructField('"genre"', StringType(), nullable=True),
501+
StructField("PRICE", DoubleType(), nullable=True),
502+
StructField('"publish_Date"', DateType(), nullable=True),
503+
StructField('"extra_col"', StringType(), nullable=True),
504+
]
505+
)
506+
507+
df = (
508+
session.read.option("rowTag", "book")
509+
.schema(user_schema)
510+
.xml(f"@{tmp_stage_name}/{test_file_books_xml}")
511+
)
512+
expected_result = [
513+
Row(
514+
Author="Gambardella, Matthew",
515+
Title="XML Developer's Guide",
516+
genre="Computer",
517+
PRICE=44.95,
518+
publish_Date=datetime.date(2000, 10, 1),
519+
extra_col=None,
520+
),
521+
Row(
522+
Author="Corets, Eva",
523+
Title="Maeve Ascendant",
524+
genre="Fantasy",
525+
PRICE=5.95,
526+
publish_Date=datetime.date(2000, 11, 17),
527+
extra_col=None,
528+
),
529+
Row(
530+
Author="Kress, Peter",
531+
Title="Paradox Lost",
532+
genre="Science Fiction",
533+
PRICE=6.95,
534+
publish_Date=datetime.date(2000, 11, 2),
535+
extra_col=None,
536+
),
537+
Row(
538+
Author="Ralls, Kim",
539+
Title="Midnight Rain",
540+
genre="Fantasy",
541+
PRICE=5.95,
542+
publish_Date=datetime.date(2000, 12, 16),
543+
extra_col=None,
544+
),
545+
Row(
546+
Author="Knorr, Stefan",
547+
Title="Creepy Crawlies",
548+
genre="Horror",
549+
PRICE=4.95,
550+
publish_Date=datetime.date(2000, 12, 6),
551+
extra_col=None,
552+
),
553+
Row(
554+
Author="Thurman, Paula",
555+
Title="Splish Splash",
556+
genre="Romance",
557+
PRICE=4.95,
558+
publish_Date=datetime.date(2000, 11, 2),
559+
extra_col=None,
560+
),
561+
Row(
562+
Author="Randall, Cynthia",
563+
Title="Lover Birds",
564+
genre="Romance",
565+
PRICE=4.95,
566+
publish_Date=datetime.date(2000, 9, 2),
567+
extra_col=None,
568+
),
569+
Row(
570+
Author="Corets, Eva",
571+
Title="The Sundered Grail",
572+
genre="Fantasy",
573+
PRICE=5.95,
574+
publish_Date=datetime.date(2001, 9, 10),
575+
extra_col=None,
576+
),
577+
Row(
578+
Author="Corets, Eva",
579+
Title="Oberon's Legacy",
580+
genre="Fantasy",
581+
PRICE=5.95,
582+
publish_Date=datetime.date(2001, 3, 10),
583+
extra_col=None,
584+
),
585+
Row(
586+
Author="O'Brien, Tim",
587+
Title="Microsoft .NET: The Programming Bible",
588+
genre="Computer",
589+
PRICE=36.95,
590+
publish_Date=datetime.date(2000, 12, 9),
591+
extra_col=None,
592+
),
593+
Row(
594+
Author="O'Brien, Tim",
595+
Title="MSXML3: A Comprehensive Guide",
596+
genre="Computer",
597+
PRICE=36.95,
598+
publish_Date=datetime.date(2000, 12, 1),
599+
extra_col=None,
600+
),
601+
Row(
602+
Author="Galos, Mike",
603+
Title="Visual Studio 7: A Comprehensive Guide",
604+
genre="Computer",
605+
PRICE=49.95,
606+
publish_Date=datetime.date(2001, 4, 16),
607+
extra_col=None,
608+
),
609+
]
610+
Utils.check_answer(df, expected_result)
611+
assert df.schema == expected_schema

0 commit comments

Comments
 (0)