Skip to content

Commit 103fe63

Browse files
author
Ilyas Gasanov
committed
[DOP-21450] Add compressions for file formats
1 parent a8a4bac commit 103fe63

File tree

3 files changed

+8
-10
lines changed

3 files changed

+8
-10
lines changed

syncmaster/schemas/v1/transfers/file_format.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -18,15 +18,13 @@
1818

1919

2020
class ORCCompression(str, Enum):
21-
UNCOMPRESSED = "uncompressed"
2221
NONE = "none"
2322
SNAPPY = "snappy"
2423
ZLIB = "zlib"
2524
LZ4 = "lz4"
2625

2726

2827
class ParquetCompression(str, Enum):
29-
UNCOMPRESSED = "uncompressed"
3028
NONE = "none"
3129
SNAPPY = "snappy"
3230
GZIP = "gzip"
@@ -66,21 +64,21 @@ class CSV(BaseModel):
6664
escape: str = "\\"
6765
include_header: bool = False
6866
line_sep: str = "\n"
69-
compression: CSVCompression = CSVCompression.NONE
67+
compression: CSVCompression = CSVCompression.GZIP
7068

7169

7270
class JSONLine(BaseModel):
7371
type: JSONLINE_FORMAT
7472
encoding: str = "utf-8"
7573
line_sep: str = "\n"
76-
compression: JSONCompression = CSVCompression.NONE
74+
compression: JSONCompression = JSONCompression.GZIP
7775

7876

7977
class JSON(BaseModel):
8078
type: JSON_FORMAT
8179
encoding: str = "utf-8"
8280
line_sep: str = "\n"
83-
compression: JSONCompression = CSVCompression.NONE
81+
compression: JSONCompression = JSONCompression.GZIP
8482

8583

8684
class Excel(BaseModel):
@@ -93,14 +91,14 @@ class XML(BaseModel):
9391
type: XML_FORMAT
9492
root_tag: str
9593
row_tag: str
96-
compression: XMLCompression | None = None
94+
compression: XMLCompression = XMLCompression.GZIP
9795

9896

9997
class ORC(BaseModel):
10098
type: ORC_FORMAT
101-
compression: ORCCompression = CSVCompression.NONE
99+
compression: ORCCompression = ORCCompression.ZLIB
102100

103101

104102
class Parquet(BaseModel):
105103
type: PARQUET_FORMAT
106-
compression: ParquetCompression = CSVCompression.NONE
104+
compression: ParquetCompression = ParquetCompression.SNAPPY

tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -276,7 +276,7 @@ async def test_run_transfer_postgres_to_hdfs(
276276
df = reader.run()
277277

278278
# as Excel does not support datetime values with precision greater than milliseconds
279-
if format_name in ("excel", "parquet"):
279+
if format_name == "excel":
280280
init_df = init_df.withColumn(
281281
"REGISTERED_AT",
282282
to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

tests/test_integration/test_run_transfer/test_s3.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -277,7 +277,7 @@ async def test_run_transfer_postgres_to_s3(
277277
df = reader.run()
278278

279279
# as Excel does not support datetime values with precision greater than milliseconds
280-
if format_name in ("excel", "parquet"):
280+
if format_name == "excel":
281281
init_df = init_df.withColumn(
282282
"REGISTERED_AT",
283283
to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

0 commit comments

Comments
 (0)