Commit 2d7c931

[DOP-21450] Add compression options for file formats (#159)
1 parent 7a6b1a1 commit 2d7c931

File tree: 15 files changed, +135 −14 lines

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Add compression options to file formats CSV, JSON, JSONLine, Excel, ORC, Parquet
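
Note: concretely, the new option surfaces as an extra "compression" key on the file format payloads validated by the schemas changed below. The snippets here are illustrative only; field names mirror the pydantic models added in syncmaster/schemas/v1/transfers/file_format.py, and the surrounding transfer payload is not part of this diff, so only the file_format fields themselves are shown.

# Illustrative file_format payloads with the new "compression" key.
# Defaults from the diff apply when "compression" is omitted.
csv_format = {"type": "csv", "delimiter": ",", "compression": "gzip"}
orc_format = {"type": "orc", "compression": "zlib"}
parquet_format = {"type": "parquet", "compression": "snappy"}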

syncmaster/dto/transfers.py

Lines changed: 4 additions & 2 deletions
@@ -4,7 +4,7 @@
 from dataclasses import dataclass
 from typing import ClassVar

-from onetl.file.format import CSV, JSON, Excel, JSONLine
+from onetl.file.format import CSV, JSON, ORC, Excel, JSONLine, Parquet


 @dataclass
@@ -20,7 +20,7 @@ class DBTransferDTO(TransferDTO):
 @dataclass
 class FileTransferDTO(TransferDTO):
     directory_path: str
-    file_format: CSV | JSONLine | JSON | Excel
+    file_format: CSV | JSONLine | JSON | Excel | ORC | Parquet
     options: dict
     df_schema: dict | None = None

@@ -29,6 +29,8 @@ class FileTransferDTO(TransferDTO):
         "jsonline": JSONLine,
         "json": JSON,
         "excel": Excel,
+        "orc": ORC,
+        "parquet": Parquet,
     }

     def __post_init__(self):
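
Note: the _format_mapping table is presumably what turns the string format name from a transfer definition into the matching onetl format class. The body of __post_init__ is not shown in this diff, so the lookup below is a hedged sketch of that pattern, not the repository's actual implementation.

# Hypothetical sketch of resolving a format name via a mapping like
# FileTransferDTO._format_mapping; the real __post_init__ is not in this diff.
from onetl.file.format import CSV, JSON, ORC, Excel, JSONLine, Parquet

_format_mapping = {
    "csv": CSV,
    "jsonline": JSONLine,
    "json": JSON,
    "excel": Excel,
    "orc": ORC,
    "parquet": Parquet,
}

def resolve_format(name: str, options: dict):
    # Unknown names fail loudly instead of silently producing the wrong format.
    try:
        format_class = _format_mapping[name]
    except KeyError:
        raise ValueError(f"Unsupported file format: {name}") from None
    # e.g. ORC(compression="snappy"), as the test fixtures below do with **params
    return format_class(**options)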

syncmaster/schemas/v1/transfers/file_format.py

Lines changed: 39 additions & 0 deletions
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations

+from enum import Enum
+
 from pydantic import BaseModel

 from syncmaster.schemas.v1.file_formats import (
@@ -15,6 +17,38 @@
 )


+class ORCCompression(str, Enum):
+    NONE = "none"
+    SNAPPY = "snappy"
+    ZLIB = "zlib"
+    LZ4 = "lz4"
+
+
+class ParquetCompression(str, Enum):
+    NONE = "none"
+    SNAPPY = "snappy"
+    GZIP = "gzip"
+    LZ4 = "lz4"
+
+
+class JSONCompression(str, Enum):
+    NONE = "none"
+    BZIP2 = "bzip2"
+    GZIP = "gzip"
+    LZ4 = "lz4"
+    SNAPPY = "snappy"
+    DEFLATE = "deflate"
+
+
+class CSVCompression(str, Enum):
+    NONE = "none"
+    BZIP2 = "bzip2"
+    GZIP = "gzip"
+    LZ4 = "lz4"
+    SNAPPY = "snappy"
+    DEFLATE = "deflate"
+
+
 class CSV(BaseModel):
     type: CSV_FORMAT
     delimiter: str = ","
@@ -23,18 +57,21 @@ class CSV(BaseModel):
     escape: str = "\\"
     include_header: bool = False
     line_sep: str = "\n"
+    compression: CSVCompression = CSVCompression.GZIP


 class JSONLine(BaseModel):
     type: JSONLINE_FORMAT
     encoding: str = "utf-8"
     line_sep: str = "\n"
+    compression: JSONCompression = JSONCompression.GZIP


 class JSON(BaseModel):
     type: JSON_FORMAT
     encoding: str = "utf-8"
     line_sep: str = "\n"
+    compression: JSONCompression = JSONCompression.GZIP


 class Excel(BaseModel):
@@ -51,7 +88,9 @@ class XML(BaseModel):

 class ORC(BaseModel):
     type: ORC_FORMAT
+    compression: ORCCompression = ORCCompression.ZLIB


 class Parquet(BaseModel):
     type: PARQUET_FORMAT
+    compression: ParquetCompression = ParquetCompression.SNAPPY
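
Note: a quick way to see what the new enums buy is to validate a payload against one of these models. The sketch below re-declares a trimmed-down CSV model locally (with Literal["csv"] standing in for the imported CSV_FORMAT constant) so it runs standalone; it only illustrates the enum validation behaviour added above.

# Standalone sketch mirroring the models above; CSV_FORMAT is replaced with a
# plain Literal so the example runs without the syncmaster package installed.
from enum import Enum
from typing import Literal

from pydantic import BaseModel, ValidationError


class CSVCompression(str, Enum):
    NONE = "none"
    BZIP2 = "bzip2"
    GZIP = "gzip"
    LZ4 = "lz4"
    SNAPPY = "snappy"
    DEFLATE = "deflate"


class CSV(BaseModel):
    type: Literal["csv"]
    delimiter: str = ","
    compression: CSVCompression = CSVCompression.GZIP


print(CSV(type="csv").compression)                      # CSVCompression.GZIP (default)
print(CSV(type="csv", compression="lz4").compression)   # accepted, coerced to the enum

try:
    CSV(type="csv", compression="zstd")                  # not in the enum -> rejected
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["loc"])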

tests/resources/file_df_connection/generate_files.py

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ def get_pyarrow_schema() -> ArrowSchema:
         pa.field("REGION", pa.string()),
         pa.field("NUMBER", pa.int32()),
         pa.field("BIRTH_DATE", pa.date32()),
-        pa.field("REGISTERED_AT", pa.timestamp("ms")),
+        pa.field("REGISTERED_AT", pa.timestamp("us")),
         pa.field("ACCOUNT_BALANCE", pa.float64()),
     ],
 )
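
Note: the "ms" to "us" change matters because the two units are distinct Arrow/Parquet types, and Spark's TimestampType is microsecond-precision; presumably the generated fixtures are meant to match that. A small standalone pyarrow check:

# Millisecond and microsecond timestamp columns are different Arrow types,
# so fixtures generated with "ms" would not match a microsecond schema.
import pyarrow as pa

ms_field = pa.field("REGISTERED_AT", pa.timestamp("ms"))
us_field = pa.field("REGISTERED_AT", pa.timestamp("us"))

print(ms_field.type)                    # timestamp[ms]
print(us_field.type)                    # timestamp[us]
print(ms_field.type == us_field.type)   # False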
Binary files not shown (4 files).

tests/test_integration/test_run_transfer/conftest.py

Lines changed: 23 additions & 3 deletions
@@ -11,7 +11,7 @@
 from onetl.connection import MSSQL, Clickhouse, Hive, MySQL, Oracle, Postgres, SparkS3
 from onetl.connection.file_connection.s3 import S3
 from onetl.db import DBWriter
-from onetl.file.format import CSV, JSON, Excel, JSONLine
+from onetl.file.format import CSV, JSON, ORC, Excel, JSONLine, Parquet
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql.types import (
     DateType,
@@ -804,7 +804,7 @@ def fill_with_data(df: DataFrame):
     pass


-@pytest.fixture(params=[("csv", {}), ("jsonline", {}), ("json", {}), ("excel", {})])
+@pytest.fixture()
 def source_file_format(request: FixtureRequest):
     name, params = request.param
     if name == "csv":
@@ -835,10 +835,20 @@ def source_file_format(request: FixtureRequest):
             **params,
         )

+    if name == "orc":
+        return "orc", ORC(
+            **params,
+        )
+
+    if name == "parquet":
+        return "parquet", Parquet(
+            **params,
+        )
+
     raise ValueError(f"Unsupported file format: {name}")


-@pytest.fixture(params=[("csv", {}), ("jsonline", {}), ("excel", {})])
+@pytest.fixture()
 def target_file_format(request: FixtureRequest):
     name, params = request.param
     if name == "csv":
@@ -863,6 +873,16 @@ def target_file_format(request: FixtureRequest):
             **params,
         )

+    if name == "orc":
+        return "orc", ORC(
+            **params,
+        )
+
+    if name == "parquet":
+        return "parquet", Parquet(
+            **params,
+        )
+
     raise ValueError(f"Unsupported file format: {name}")

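Note: dropping the fixture-level params and using a bare @pytest.fixture() means each test now supplies its formats through indirect parametrization, as the HDFS tests below do. A minimal self-contained sketch of that mechanism (fixture and test names here are invented for illustration, not taken from the repo):

# Indirect parametrization: the parameter tuple reaches the fixture via
# request.param instead of being hard-coded in @pytest.fixture(params=[...]).
import pytest


@pytest.fixture()
def source_file_format(request):
    name, params = request.param
    return name, params


@pytest.mark.parametrize(
    "source_file_format",
    [("orc", {}), ("parquet", {"compression": "lz4"})],
    indirect=True,
    ids=["orc", "parquet"],
)
def test_format_is_delivered(source_file_format):
    name, params = source_file_format
    assert name in {"orc", "parquet"}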

tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 25 additions & 4 deletions
@@ -13,6 +13,7 @@
 from sqlalchemy.ext.asyncio import AsyncSession

 from syncmaster.db.models import Connection, Group, Queue, Status
+from syncmaster.db.models.transfer import Transfer
 from tests.mocks import MockUser
 from tests.test_unit.utils import create_transfer
 from tests.utils import get_run_on_end
@@ -128,6 +129,16 @@ async def postgres_to_hdfs(
             "with_header",
             id="excel",
         ),
+        pytest.param(
+            ("orc", {}),
+            "without_compression",
+            id="orc",
+        ),
+        pytest.param(
+            ("parquet", {}),
+            "without_compression",
+            id="parquet",
+        ),
     ],
     indirect=["source_file_format", "file_format_flavor"],
 )
@@ -136,7 +147,7 @@ async def test_run_transfer_hdfs_to_postgres(
     group_owner: MockUser,
     init_df: DataFrame,
     client: AsyncClient,
-    hdfs_to_postgres: Connection,
+    hdfs_to_postgres: Transfer,
     source_file_format,
     file_format_flavor,
 ):
@@ -188,8 +199,8 @@ async def test_run_transfer_hdfs_to_postgres(
     "target_file_format, file_format_flavor",
     [
         pytest.param(
-            ("csv", {}),
-            "with_header",
+            ("csv", {"compression": "lz4"}),
+            "with_compression",
             id="csv",
         ),
         pytest.param(
@@ -202,6 +213,16 @@ async def test_run_transfer_hdfs_to_postgres(
             "with_header",
             id="excel",
         ),
+        pytest.param(
+            ("orc", {"compression": "snappy"}),
+            "with_compression",
+            id="orc",
+        ),
+        pytest.param(
+            ("parquet", {"compression": "lz4"}),
+            "with_compression",
+            id="parquet",
+        ),
     ],
     indirect=["target_file_format", "file_format_flavor"],
 )
@@ -211,7 +232,7 @@ async def test_run_transfer_postgres_to_hdfs(
     client: AsyncClient,
     prepare_postgres,
     hdfs_file_df_connection: SparkHDFS,
-    postgres_to_hdfs: Connection,
+    postgres_to_hdfs: Transfer,
     hdfs_connection: SparkHDFS,
     target_file_format,
     file_format_flavor: str,
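
Note: the compression values exercised here ("lz4" for CSV and Parquet, "snappy" for ORC) correspond to Spark writer options. Below is a standalone PySpark sketch of the equivalent low-level calls; the session setup and output paths are illustrative and not taken from these tests.

# Illustrative only: what the compression settings map to at the Spark level.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("compression-demo").getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])

# Same codec names as the enums added in this commit.
df.write.mode("overwrite").option("compression", "lz4").csv("/tmp/demo_csv")
df.write.mode("overwrite").option("compression", "snappy").orc("/tmp/demo_orc")
df.write.mode("overwrite").option("compression", "lz4").parquet("/tmp/demo_parquet")

spark.stop()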
