Commit 66ea992

[DOP-21445] Add XML integration tests (#158)
1 parent 2d7c931 commit 66ea992

7 files changed: +55 / -13 lines

docker-compose.test.yml (3 additions, 2 deletions)

@@ -196,7 +196,7 @@ services:
       interval: 30s
       timeout: 5s
       retries: 3
-    profiles: [hive, hdfs, all]
+    profiles: [hive, hdfs, s3, all]
 
   keycloak:
     image: quay.io/keycloak/keycloak:latest
@@ -234,7 +234,8 @@ services:
       HIVE_METASTORE_DB_DRIVER: org.postgresql.Driver
       HIVE_METASTORE_DB_USER: test_hive
       HIVE_METASTORE_DB_PASSWORD: test_hive
-    profiles: [hive, hdfs, all]
+    # writing spark dataframe to s3 xml file fails without running hive metastore server
+    profiles: [hive, hdfs, s3, all]
 
 volumes:
   postgres_test_data:
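
Per the in-diff comment, the Hive metastore container now also starts under the s3 profile because the XML write path fails without a reachable metastore. A minimal sketch of a Spark session wired to such a metastore, assuming the test session is configured this way; the hostname and port are hypothetical stand-ins for whatever docker-compose.test.yml exposes:

from pyspark.sql import SparkSession

# Hypothetical host/port; the real values come from the compose stack.
spark = (
    SparkSession.builder
    .appName("syncmaster-xml-test")
    .config("hive.metastore.uris", "thrift://hivemetastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)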

syncmaster/dto/transfers.py (3 additions, 2 deletions)

@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 from typing import ClassVar
 
-from onetl.file.format import CSV, JSON, ORC, Excel, JSONLine, Parquet
+from onetl.file.format import CSV, JSON, ORC, XML, Excel, JSONLine, Parquet
 
 
 @dataclass
@@ -20,7 +20,7 @@ class DBTransferDTO(TransferDTO):
 @dataclass
 class FileTransferDTO(TransferDTO):
     directory_path: str
-    file_format: CSV | JSONLine | JSON | Excel | ORC | Parquet
+    file_format: CSV | JSONLine | JSON | Excel | XML | ORC | Parquet
     options: dict
     df_schema: dict | None = None
 
@@ -31,6 +31,7 @@ class FileTransferDTO(TransferDTO):
         "excel": Excel,
         "orc": ORC,
         "parquet": Parquet,
+        "xml": XML,
     }
 
     def __post_init__(self):
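
With "xml" registered in the format map, a transfer whose format is declared by name can be resolved to an onetl XML instance through the same lookup the other formats use. A minimal sketch of that lookup; the dict name below is a stand-in, since the hunk only shows the entries, and row_tag mirrors the test fixtures:

from onetl.file.format import CSV, XML

# Stand-in for the DTO's parser mapping; only its entries appear in the diff.
format_parsers = {"csv": CSV, "xml": XML}

file_format = format_parsers["xml"](row_tag="item")
print(type(file_format).__name__)  # "XML", the class name the S3 handler checks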

syncmaster/worker/handlers/file/s3.py (1 addition, 1 deletion)

@@ -35,7 +35,7 @@ def read(self) -> DataFrame:
         from pyspark.sql.types import StructType
 
         options = {}
-        if self.transfer_dto.file_format.__class__.__name__ == "Excel":
+        if self.transfer_dto.file_format.__class__.__name__ in ("Excel", "XML"):
            options = {"inferSchema": True}
 
        reader = FileDFReader(
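
Like Excel, XML files carry no embedded schema, so the handler asks Spark to infer one. A rough equivalent in plain PySpark, assuming the spark-xml package is on the classpath; the bucket and path are illustrative:

df = (
    spark.read.format("xml")
    .option("rowTag", "item")      # same role as onetl's XML(row_tag="item")
    .option("inferSchema", True)   # what the handler's options dict enables
    .load("s3a://test-bucket/source/")
)
df.printSchema()  # column types guessed from the data, as with the Excel reader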

syncmaster/worker/spark.py (10 additions, 5 deletions)

@@ -36,8 +36,14 @@ def get_worker_spark_session(
 
 
 def get_packages(db_type: str) -> list[str]:
+    import pyspark
     from onetl.connection import MSSQL, Clickhouse, MySQL, Oracle, Postgres, SparkS3
-    from onetl.file.format import Excel
+    from onetl.file.format import XML, Excel
+
+    # excel version is hardcoded due to https://github.com/nightscape/spark-excel/issues/902
+    file_formats_spark_packages: list[str] = XML.get_packages(spark_version=pyspark.__version__) + Excel.get_packages(
+        spark_version="3.5.1",
+    )
 
     if db_type == "postgres":
         return Postgres.get_packages()
@@ -54,11 +60,10 @@ def get_packages(db_type: str) -> list[str]:
         import pyspark
 
         spark_version = pyspark.__version__
-        # see supported versions from https://mvnrepository.com/artifact/com.crealytics/spark-excel
-        return SparkS3.get_packages(spark_version=spark_version) + Excel.get_packages(spark_version="3.5.1")
+        return SparkS3.get_packages(spark_version=spark_version) + file_formats_spark_packages
+
     if db_type == "hdfs":
-        # see supported versions from https://mvnrepository.com/artifact/com.crealytics/spark-excel
-        return Excel.get_packages(spark_version="3.5.1")
+        return file_formats_spark_packages
 
     # If the database type does not require downloading .jar packages
     return []
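
The s3 and hdfs branches now share one package list computed once at the top of get_packages. A sketch of what a caller presumably does with these Maven coordinates; the exact versions returned depend on the installed onetl and Spark releases, so the commented output is only indicative:

import pyspark
from pyspark.sql import SparkSession
from onetl.file.format import XML, Excel

packages = XML.get_packages(spark_version=pyspark.__version__) + Excel.get_packages(spark_version="3.5.1")
# roughly: ["com.databricks:spark-xml_2.12:<ver>", "com.crealytics:spark-excel_2.12:3.5.1_<ver>"]

spark = (
    SparkSession.builder
    .config("spark.jars.packages", ",".join(packages))  # Spark fetches the jars at startup
    .getOrCreate()
)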

tests/test_integration/test_run_transfer/conftest.py (18 additions, 3 deletions)

@@ -11,7 +11,7 @@
 from onetl.connection import MSSQL, Clickhouse, Hive, MySQL, Oracle, Postgres, SparkS3
 from onetl.connection.file_connection.s3 import S3
 from onetl.db import DBWriter
-from onetl.file.format import CSV, JSON, ORC, Excel, JSONLine, Parquet
+from onetl.file.format import CSV, JSON, ORC, XML, Excel, JSONLine, Parquet
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import (
     DateType,
@@ -114,8 +114,11 @@ def spark(settings: Settings, request: FixtureRequest) -> SparkSession:
     )
 
     if "hdfs" in markers or "s3" in markers:
-        # see supported versions from https://mvnrepository.com/artifact/com.crealytics/spark-excel
-        maven_packages.extend(Excel.get_packages(spark_version="3.5.1"))
+        # excel version is hardcoded due to https://github.com/nightscape/spark-excel/issues/902
+        file_formats_spark_packages: list[str] = XML.get_packages(
+            spark_version=pyspark.__version__,
+        ) + Excel.get_packages(spark_version="3.5.1")
+        maven_packages.extend(file_formats_spark_packages)
 
     if maven_packages:
         spark = spark.config("spark.jars.packages", ",".join(maven_packages))
@@ -845,6 +848,12 @@ def source_file_format(request: FixtureRequest):
         **params,
     )
 
+    if name == "xml":
+        return "xml", XML(
+            row_tag="item",
+            **params,
+        )
+
     raise ValueError(f"Unsupported file format: {name}")
 
 
@@ -883,6 +892,12 @@ def target_file_format(request: FixtureRequest):
         **params,
     )
 
+    if name == "xml":
+        return "xml", XML(
+            row_tag="item",
+            **params,
+        )
+
     raise ValueError(f"Unsupported file format: {name}")
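
On the writing side of a transfer, the XML(row_tag="item") instance returned by these fixtures is what drives the file output. A hedged sketch of that write path with onetl, assuming a file DataFrame connection (e.g. SparkS3 or SparkHDFS) already exists; the connection, path, and df names are illustrative:

from onetl.file import FileDFWriter
from onetl.file.format import XML

writer = FileDFWriter(
    connection=file_df_connection,  # e.g. a SparkS3/SparkHDFS connection from other fixtures
    format=XML(row_tag="item"),     # the object the fixture returns for "xml"
    target_path="/target/directory",
    options=FileDFWriter.Options(if_exists="replace_entire_directory"),
)
writer.run(df)                      # df: any Spark DataFrame prepared by the test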

tests/test_integration/test_run_transfer/test_hdfs.py (10 additions, 0 deletions)

@@ -139,6 +139,11 @@ async def postgres_to_hdfs(
             "without_compression",
             id="parquet",
         ),
+        pytest.param(
+            ("xml", {}),
+            "without_compression",
+            id="xml",
+        ),
     ],
     indirect=["source_file_format", "file_format_flavor"],
 )
@@ -223,6 +228,11 @@ async def test_run_transfer_hdfs_to_postgres(
             "with_compression",
             id="parquet",
         ),
+        pytest.param(
+            ("xml", {}),
+            "without_compression",
+            id="xml",
+        ),
     ],
     indirect=["target_file_format", "file_format_flavor"],
 )
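
The xml case serializes each DataFrame row as one <item> element, which is what row_tag="item" selects. A small plain-PySpark illustration of the shape such files take; the column names, values, and output path are made up, and rootTag/rowTag are set explicitly for clarity:

df = spark.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])
(
    df.write.format("xml")
    .option("rootTag", "items")
    .option("rowTag", "item")
    .save("/tmp/xml-sample")
)
# Each part file then looks roughly like:
# <items>
#     <item><id>1</id><name>alice</name></item>
#     <item><id>2</id><name>bob</name></item>
# </items>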

tests/test_integration/test_run_transfer/test_s3.py (10 additions, 0 deletions)

@@ -139,6 +139,11 @@ async def postgres_to_s3(
             "without_compression",
             id="parquet",
         ),
+        pytest.param(
+            ("xml", {}),
+            "without_compression",
+            id="xml",
+        ),
     ],
     indirect=["source_file_format", "file_format_flavor"],
 )
@@ -224,6 +229,11 @@ async def test_run_transfer_s3_to_postgres(
             "with_compression",
             id="parquet",
         ),
+        pytest.param(
+            ("xml", {}),
+            "without_compression",
+            id="xml",
+        ),
     ],
     indirect=["target_file_format", "file_format_flavor"],
 )
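
As in the HDFS module, the parameters reach the fixtures indirectly: pytest routes the ("xml", {}) tuple into source_file_format/target_file_format and the flavor string into file_format_flavor before the test body runs. A compact sketch of that mechanism, assuming the conftest fixtures above are in scope and that file_format_flavor simply echoes its parameter:

import pytest

@pytest.mark.parametrize(
    "source_file_format, file_format_flavor",
    [pytest.param(("xml", {}), "without_compression", id="xml")],
    indirect=["source_file_format", "file_format_flavor"],
)
def test_xml_param_routing(source_file_format, file_format_flavor):
    name, file_format = source_file_format  # resolved by the conftest fixture
    assert name == "xml"
    assert file_format_flavor == "without_compression"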
