
Commit 5467e23

Author: maxim-lixakov

[DOP-21445] add orc, parquet tests

Parent: 10122be

File tree: 5 files changed, +82 -18 lines changed


syncmaster/dto/transfers.py

Lines changed: 3 additions & 1 deletion

@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 from typing import ClassVar

-from onetl.file.format import CSV, JSON, XML, Excel, JSONLine
+from onetl.file.format import CSV, JSON, ORC, XML, Excel, JSONLine, Parquet


 @dataclass
@@ -30,6 +30,8 @@ class FileTransferDTO(TransferDTO):
         "json": JSON,
         "excel": Excel,
         "xml": XML,
+        "orc": ORC,
+        "parquet": Parquet,
     }

     def __post_init__(self):
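The registry extended here maps a format name string to its onetl format class, so a stored transfer definition can be turned into a format object by name. A minimal sketch of how such a lookup presumably works (the `build_file_format` helper and `_format_registry` name are illustrative, not the project's actual API):

    from onetl.file.format import CSV, JSON, ORC, XML, Excel, JSONLine, Parquet

    # Hypothetical registry mirroring FileTransferDTO's mapping.
    _format_registry = {
        "csv": CSV,
        "jsonline": JSONLine,
        "json": JSON,
        "excel": Excel,
        "xml": XML,
        "orc": ORC,
        "parquet": Parquet,
    }

    def build_file_format(name: str, options: dict):
        # Resolve the class by name and instantiate it with user-supplied options.
        try:
            format_cls = _format_registry[name]
        except KeyError:
            raise ValueError(f"Unsupported file format: {name}")
        return format_cls(**options)

    # e.g. build_file_format("parquet", {}) -> Parquet()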

syncmaster/worker/spark.py

Lines changed: 0 additions & 1 deletion

@@ -32,7 +32,6 @@ def get_worker_spark_session(
         log.debug("Enabling Hive support")
         spark_builder = spark_builder.enableHiveSupport()

-    spark_builder.sparkContext.setLogLevel("DEBUG")
     return spark_builder.getOrCreate()
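Dropping this line removes debug noise, and the line was also broken as written: in PySpark, `SparkSession.Builder` has no `sparkContext` attribute, so the log level can only be set on a session that already exists. A minimal sketch of the usual pattern (the app name is illustrative):

    from pyspark.sql import SparkSession

    # Create the session first; only a live session exposes a SparkContext.
    spark = SparkSession.builder.appName("syncmaster-worker").getOrCreate()

    # Adjust verbosity on the running context, not on the builder.
    spark.sparkContext.setLogLevel("WARN")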

tests/test_integration/test_run_transfer/conftest.py

Lines changed: 26 additions & 3 deletions

@@ -11,7 +11,7 @@
 from onetl.connection import MSSQL, Clickhouse, Hive, MySQL, Oracle, Postgres, SparkS3
 from onetl.connection.file_connection.s3 import S3
 from onetl.db import DBWriter
-from onetl.file.format import CSV, JSON, XML, Excel, JSONLine
+from onetl.file.format import CSV, JSON, ORC, XML, Excel, JSONLine, Parquet
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import (
     DateType,
@@ -119,6 +119,7 @@ def spark(settings: Settings, request: FixtureRequest) -> SparkSession:
     file_formats_spark_packages: list[str] = XML.get_packages(spark_version=spark_version) + Excel.get_packages(
         spark_version=spark_version,
     )
+
     maven_packages.extend(file_formats_spark_packages)

     if maven_packages:
@@ -808,7 +809,9 @@ def fill_with_data(df: DataFrame):
     pass


-@pytest.fixture(params=[("csv", {}), ("jsonline", {}), ("json", {}), ("excel", {}), ("xml", {})])
+@pytest.fixture(
+    params=[("csv", {}), ("jsonline", {}), ("json", {}), ("excel", {}), ("xml", {}), ("orc", {}), ("parquet", {})],
+)
 def source_file_format(request: FixtureRequest):
     name, params = request.param
     if name == "csv":
@@ -845,10 +848,20 @@ def source_file_format(request: FixtureRequest):
             **params,
         )

+    if name == "orc":
+        return "orc", ORC(
+            **params,
+        )
+
+    if name == "parquet":
+        return "parquet", Parquet(
+            **params,
+        )
+
     raise ValueError(f"Unsupported file format: {name}")


-@pytest.fixture(params=[("csv", {}), ("jsonline", {}), ("excel", {}), ("xml", {})])
+@pytest.fixture(params=[("csv", {}), ("jsonline", {}), ("excel", {}), ("xml", {}), ("orc", {}), ("parquet", {})])
 def target_file_format(request: FixtureRequest):
     name, params = request.param
     if name == "csv":
@@ -879,6 +892,16 @@ def target_file_format(request: FixtureRequest):
             **params,
         )

+    if name == "orc":
+        return "orc", ORC(
+            **params,
+        )
+
+    if name == "parquet":
+        return "parquet", Parquet(
+            **params,
+        )
+
     raise ValueError(f"Unsupported file format: {name}")
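Both fixtures share the `(name, params)` convention and are fed through pytest's indirect parametrization, which routes each parameter tuple into the fixture via `request.param` before the test body runs. A minimal self-contained sketch of that mechanism (fixture and test names here are illustrative, and a plain dict stands in for the onetl format object):

    import pytest

    @pytest.fixture
    def file_format(request: pytest.FixtureRequest):
        # request.param carries the (name, params) tuple passed via indirect=...
        name, params = request.param
        if name in ("orc", "parquet"):
            return name, {"format": name, **params}
        raise ValueError(f"Unsupported file format: {name}")

    @pytest.mark.parametrize("file_format", [("orc", {}), ("parquet", {})], indirect=True)
    def test_format_resolution(file_format):
        name, fmt = file_format
        assert fmt["format"] == name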

tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 24 additions & 4 deletions

@@ -133,6 +133,16 @@ async def postgres_to_hdfs(
             "without_compression",
             id="xml",
         ),
+        pytest.param(
+            ("orc", {}),
+            "without_compression",
+            id="orc",
+        ),
+        pytest.param(
+            ("parquet", {}),
+            "without_compression",
+            id="parquet",
+        ),
     ],
     indirect=["source_file_format", "file_format_flavor"],
 )
@@ -178,8 +188,8 @@ async def test_run_transfer_hdfs_to_postgres(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format == "excel":
+    # as Excel and Parquet do not support datetime values with precision greater than milliseconds
+    if file_format in ("excel", "parquet"):
         df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
         init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -212,6 +222,16 @@ async def test_run_transfer_hdfs_to_postgres(
             "without_compression",
             id="xml",
         ),
+        pytest.param(
+            ("orc", {}),
+            "without_compression",
+            id="orc",
+        ),
+        pytest.param(
+            ("parquet", {}),
+            "without_compression",
+            id="parquet",
+        ),
     ],
     indirect=["target_file_format", "file_format_flavor"],
 )
@@ -264,8 +284,8 @@ async def test_run_transfer_postgres_to_hdfs(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name == "excel":
+    # as Excel and Parquet do not support datetime values with precision greater than milliseconds
+    if format_name in ("excel", "parquet"):
         init_df = init_df.withColumn(
             "REGISTERED_AT",
             to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),
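The precision workaround above truncates both sides of the comparison, so formats that cannot round-trip sub-second timestamps still compare equal to the source frame. A standalone illustration of what date_trunc("second", ...) does to such a column (the session setup is plain boilerplate):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, date_trunc

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    df = spark.createDataFrame(
        [("2024-01-01 12:34:56.789123",)], ["REGISTERED_AT"],
    ).selectExpr("CAST(REGISTERED_AT AS TIMESTAMP) AS REGISTERED_AT")

    # Drop sub-second precision so an Excel/Parquet round-trip compares equal.
    truncated = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
    truncated.show(truncate=False)  # 2024-01-01 12:34:56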

tests/test_integration/test_run_transfer/test_s3.py

Lines changed: 29 additions & 9 deletions

@@ -133,6 +133,16 @@ async def postgres_to_s3(
             "without_compression",
             id="xml",
         ),
+        pytest.param(
+            ("orc", {}),
+            "without_compression",
+            id="orc",
+        ),
+        pytest.param(
+            ("parquet", {}),
+            "without_compression",
+            id="parquet",
+        ),
     ],
     indirect=["source_file_format", "file_format_flavor"],
 )
@@ -179,8 +189,8 @@ async def test_run_transfer_s3_to_postgres(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format == "excel":
+    # as Excel and Parquet do not support datetime values with precision greater than milliseconds
+    if file_format in ("excel", "parquet"):
         df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
         init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -193,11 +203,6 @@ async def test_run_transfer_s3_to_postgres(
 @pytest.mark.parametrize(
     "target_file_format, file_format_flavor",
     [
-        pytest.param(
-            ("xml", {}),
-            "without_compression",
-            id="xml",
-        ),
         pytest.param(
             ("csv", {}),
             "with_header",
@@ -213,6 +218,21 @@ async def test_run_transfer_s3_to_postgres(
             "with_header",
             id="excel",
         ),
+        pytest.param(
+            ("xml", {}),
+            "without_compression",
+            id="xml",
+        ),
+        pytest.param(
+            ("orc", {}),
+            "without_compression",
+            id="orc",
+        ),
+        pytest.param(
+            ("parquet", {}),
+            "without_compression",
+            id="parquet",
+        ),
     ],
     indirect=["target_file_format", "file_format_flavor"],
 )
@@ -265,8 +285,8 @@ async def test_run_transfer_postgres_to_s3(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name == "excel":
+    # as Excel and Parquet do not support datetime values with precision greater than milliseconds
+    if format_name in ("excel", "parquet"):
         init_df = init_df.withColumn(
             "REGISTERED_AT",
             to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),
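In the write direction the tests trim the source frame instead: the timestamp is formatted down to millisecond precision and parsed back, discarding the microsecond digits before comparison. A standalone sketch of that round-trip (session setup is boilerplate):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, date_format, to_timestamp

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    df = spark.createDataFrame(
        [("2024-01-01 12:34:56.789123",)], ["REGISTERED_AT"],
    ).selectExpr("CAST(REGISTERED_AT AS TIMESTAMP) AS REGISTERED_AT")

    # Format to milliseconds and parse back, dropping microsecond digits.
    trimmed = df.withColumn(
        "REGISTERED_AT",
        to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),
    )
    trimmed.show(truncate=False)  # 2024-01-01 12:34:56.789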
