Commit 635610e

Author: Ilyas Gasanov
[DOP-21450] Add compression options for file formats except xml
1 parent: a8a4bac

File tree

12 files changed: +13 -29 lines

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-Add compression options to file formats
+Add compression options to file formats CSV, JSON, JSONLine, Excel, ORC, Parquet

syncmaster/schemas/v1/transfers/file_format.py

Lines changed: 5 additions & 15 deletions

@@ -18,15 +18,13 @@


 class ORCCompression(str, Enum):
-    UNCOMPRESSED = "uncompressed"
     NONE = "none"
     SNAPPY = "snappy"
     ZLIB = "zlib"
     LZ4 = "lz4"


 class ParquetCompression(str, Enum):
-    UNCOMPRESSED = "uncompressed"
     NONE = "none"
     SNAPPY = "snappy"
     GZIP = "gzip"
@@ -51,13 +49,6 @@ class CSVCompression(str, Enum):
     DEFLATE = "deflate"


-class XMLCompression(str, Enum):
-    BZIP2 = "bzip2"
-    GZIP = "gzip"
-    LZ4 = "lz4"
-    SNAPPY = "snappy"
-
-
 class CSV(BaseModel):
     type: CSV_FORMAT
     delimiter: str = ","
@@ -66,21 +57,21 @@ class CSV(BaseModel):
     escape: str = "\\"
     include_header: bool = False
     line_sep: str = "\n"
-    compression: CSVCompression = CSVCompression.NONE
+    compression: CSVCompression = CSVCompression.GZIP


 class JSONLine(BaseModel):
     type: JSONLINE_FORMAT
     encoding: str = "utf-8"
     line_sep: str = "\n"
-    compression: JSONCompression = CSVCompression.NONE
+    compression: JSONCompression = JSONCompression.GZIP


 class JSON(BaseModel):
     type: JSON_FORMAT
     encoding: str = "utf-8"
     line_sep: str = "\n"
-    compression: JSONCompression = CSVCompression.NONE
+    compression: JSONCompression = JSONCompression.GZIP


 class Excel(BaseModel):
@@ -93,14 +84,13 @@ class XML(BaseModel):
     type: XML_FORMAT
     root_tag: str
     row_tag: str
-    compression: XMLCompression | None = None


 class ORC(BaseModel):
     type: ORC_FORMAT
-    compression: ORCCompression = CSVCompression.NONE
+    compression: ORCCompression = ORCCompression.ZLIB


 class Parquet(BaseModel):
     type: PARQUET_FORMAT
-    compression: ParquetCompression = CSVCompression.NONE
+    compression: ParquetCompression = ParquetCompression.SNAPPY

tests/resources/file_df_connection/generate_files.py

Lines changed: 1 addition & 1 deletion

@@ -64,7 +64,7 @@ def get_pyarrow_schema() -> ArrowSchema:
            pa.field("REGION", pa.string()),
            pa.field("NUMBER", pa.int32()),
            pa.field("BIRTH_DATE", pa.date32()),
-            pa.field("REGISTERED_AT", pa.timestamp("ms")),
+            pa.field("REGISTERED_AT", pa.timestamp("us")),
            pa.field("ACCOUNT_BALANCE", pa.float64()),
        ],
    )
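The unit change matters because Python datetimes (and Spark timestamps) carry microseconds: a millisecond-unit Arrow column silently truncates them, which is presumably why the parquet/orc truncation branches in the integration tests below could be dropped. An illustrative sketch, independent of generate_files.py:

# Illustrative sketch (not from generate_files.py): microsecond vs millisecond
# timestamp units in pyarrow.
from datetime import datetime

import pyarrow as pa

value = datetime(2024, 1, 1, 12, 0, 0, 123456)

us_array = pa.array([value])  # Python datetimes are inferred as timestamp[us]
ms_array = us_array.cast(pa.timestamp("ms"), safe=False)  # truncates sub-millisecond digits

print(us_array.type, us_array[0])  # timestamp[us], fractional part .123456 kept
print(ms_array.type, ms_array[0])  # timestamp[ms], microseconds lost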
4 binary files not shown.

tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 2 additions & 2 deletions

@@ -185,7 +185,7 @@ async def test_run_transfer_hdfs_to_postgres(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format in ("excel", "parquet", "orc"):
+    if file_format == "excel":
        df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
        init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -276,7 +276,7 @@ async def test_run_transfer_postgres_to_hdfs(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name in ("excel", "parquet"):
+    if format_name == "excel":
        init_df = init_df.withColumn(
            "REGISTERED_AT",
            to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

tests/test_integration/test_run_transfer/test_s3.py

Lines changed: 2 additions & 2 deletions

@@ -186,7 +186,7 @@ async def test_run_transfer_s3_to_postgres(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format in ("excel", "parquet", "orc"):
+    if file_format == "excel":
        df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
        init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -277,7 +277,7 @@ async def test_run_transfer_postgres_to_s3(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name in ("excel", "parquet"):
+    if format_name == "excel":
        init_df = init_df.withColumn(
            "REGISTERED_AT",
            to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py

Lines changed: 2 additions & 6 deletions

@@ -59,7 +59,6 @@
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "lz4",
    },
    "options": {
        "some": "option",
@@ -167,11 +166,10 @@ async def test_developer_plus_can_create_s3_transfer(
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "lz4",
    },
    "orc": {
        "type": "orc",
-        "compression": "none",
+        "compression": "zlib",
    },
    "parquet": {
        "type": "parquet",
@@ -223,7 +221,6 @@ async def test_developer_plus_can_create_s3_transfer(
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "bzip2",
    },
},
{
@@ -323,11 +320,10 @@ async def test_developer_plus_can_create_hdfs_transfer(
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "bzip2",
    },
    "orc": {
        "type": "orc",
-        "compression": "none",
+        "compression": "zlib",
    },
    "parquet": {
        "type": "parquet",
