|
1 | 1 | # |
2 | 2 | # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved. |
3 | 3 | # |
4 | | - |
| 4 | +import datetime |
5 | 5 | import logging |
6 | 6 | import json |
7 | 7 | import pytest |
|
12 | 12 | SnowparkSQLException, |
13 | 13 | ) |
14 | 14 | from snowflake.snowpark.functions import col, lit |
| 15 | +from snowflake.snowpark.types import ( |
| 16 | + StructType, |
| 17 | + StructField, |
| 18 | + StringType, |
| 19 | + DoubleType, |
| 20 | + DateType, |
| 21 | +) |
15 | 22 | from tests.utils import TestFiles, Utils |
16 | 23 |
|
17 | 24 |
|
@@ -467,3 +474,138 @@ def test_read_xml_row_validation_xsd_path_failfast(session): |
467 | 474 | session.read.option("rowTag", row_tag).option( |
468 | 475 | "rowValidationXSDPath", f"@{tmp_stage_name}/{test_file_books_xsd}" |
469 | 476 | ).option("mode", "failfast").xml(f"@{tmp_stage_name}/{test_file_books_xml}") |
| 477 | + |
| 478 | + |
def test_read_xml_with_custom_schema(session):
    """Read the books XML file through a user-supplied schema and verify it.

    The schema handed to the reader leaves out the ``description`` field and
    declares an additional ``extra_col``. The test expects the resulting
    DataFrame to follow the supplied schema: no ``description`` column, an
    ``extra_col`` column holding ``None`` in every row, and column names
    whose case is kept as written (same behavior as pyspark).
    """
    # Schema as the user writes it: mixed-case names, nullable fields.
    requested_schema = StructType(
        [
            StructField("Author", StringType(), nullable=True),
            StructField("Title", StringType(), nullable=True),
            StructField("genre", StringType(), nullable=True),
            StructField("PRICE", DoubleType(), nullable=True),
            StructField("publish_Date", DateType(), nullable=True),
            StructField("extra_col", StringType(), nullable=True),
        ]
    )

    # Schema expected back from the DataFrame: names that are not entirely
    # upper case appear wrapped in double quotes, PRICE appears unquoted.
    expected_schema = StructType(
        [
            StructField('"Author"', StringType(), True),
            StructField('"Title"', StringType(), True),
            StructField('"genre"', StringType(), True),
            StructField("PRICE", DoubleType(), True),
            StructField('"publish_Date"', DateType(), True),
            StructField('"extra_col"', StringType(), True),
        ]
    )

    # One tuple per expected <book> row:
    # (author, title, genre, price, (year, month, day) of publication).
    book_facts = [
        ("Gambardella, Matthew", "XML Developer's Guide", "Computer", 44.95, (2000, 10, 1)),
        ("Corets, Eva", "Maeve Ascendant", "Fantasy", 5.95, (2000, 11, 17)),
        ("Kress, Peter", "Paradox Lost", "Science Fiction", 6.95, (2000, 11, 2)),
        ("Ralls, Kim", "Midnight Rain", "Fantasy", 5.95, (2000, 12, 16)),
        ("Knorr, Stefan", "Creepy Crawlies", "Horror", 4.95, (2000, 12, 6)),
        ("Thurman, Paula", "Splish Splash", "Romance", 4.95, (2000, 11, 2)),
        ("Randall, Cynthia", "Lover Birds", "Romance", 4.95, (2000, 9, 2)),
        ("Corets, Eva", "The Sundered Grail", "Fantasy", 5.95, (2001, 9, 10)),
        ("Corets, Eva", "Oberon's Legacy", "Fantasy", 5.95, (2001, 3, 10)),
        ("O'Brien, Tim", "Microsoft .NET: The Programming Bible", "Computer", 36.95, (2000, 12, 9)),
        ("O'Brien, Tim", "MSXML3: A Comprehensive Guide", "Computer", 36.95, (2000, 12, 1)),
        ("Galos, Mike", "Visual Studio 7: A Comprehensive Guide", "Computer", 49.95, (2001, 4, 16)),
    ]
    expected_rows = [
        Row(
            Author=author,
            Title=title,
            genre=genre,
            PRICE=price,
            publish_Date=datetime.date(*published),
            # Not present in the XML, so every row is expected to carry None.
            extra_col=None,
        )
        for author, title, genre, price, published in book_facts
    ]

    # Configure the reader step by step, then load the staged XML file.
    reader = session.read.option("rowTag", "book")
    reader = reader.schema(requested_schema)
    books_df = reader.xml(f"@{tmp_stage_name}/{test_file_books_xml}")

    Utils.check_answer(books_df, expected_rows)
    assert books_df.schema == expected_schema
0 commit comments