Skip to content

Commit 2c10950

Browse files
author
Ilyas Gasanov
committed
[DOP-21450] Add compressions for file formats
1 parent a8a4bac commit 2c10950

File tree

9 files changed

+14
-15
lines changed

9 files changed

+14
-15
lines changed

syncmaster/schemas/v1/transfers/file_format.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,13 @@
1818

1919

2020
class ORCCompression(str, Enum):
21-
UNCOMPRESSED = "uncompressed"
2221
NONE = "none"
2322
SNAPPY = "snappy"
2423
ZLIB = "zlib"
2524
LZ4 = "lz4"
2625

2726

2827
class ParquetCompression(str, Enum):
29-
UNCOMPRESSED = "uncompressed"
3028
NONE = "none"
3129
SNAPPY = "snappy"
3230
GZIP = "gzip"
@@ -52,6 +50,7 @@ class CSVCompression(str, Enum):
5250

5351

5452
class XMLCompression(str, Enum):
53+
NONE = "none"
5554
BZIP2 = "bzip2"
5655
GZIP = "gzip"
5756
LZ4 = "lz4"
@@ -66,21 +65,21 @@ class CSV(BaseModel):
6665
escape: str = "\\"
6766
include_header: bool = False
6867
line_sep: str = "\n"
69-
compression: CSVCompression = CSVCompression.NONE
68+
compression: CSVCompression = CSVCompression.GZIP
7069

7170

7271
class JSONLine(BaseModel):
7372
type: JSONLINE_FORMAT
7473
encoding: str = "utf-8"
7574
line_sep: str = "\n"
76-
compression: JSONCompression = CSVCompression.NONE
75+
compression: JSONCompression = JSONCompression.GZIP
7776

7877

7978
class JSON(BaseModel):
8079
type: JSON_FORMAT
8180
encoding: str = "utf-8"
8281
line_sep: str = "\n"
83-
compression: JSONCompression = CSVCompression.NONE
82+
compression: JSONCompression = JSONCompression.GZIP
8483

8584

8685
class Excel(BaseModel):
@@ -93,14 +92,14 @@ class XML(BaseModel):
9392
type: XML_FORMAT
9493
root_tag: str
9594
row_tag: str
96-
compression: XMLCompression | None = None
95+
compression: XMLCompression = XMLCompression.GZIP
9796

9897

9998
class ORC(BaseModel):
10099
type: ORC_FORMAT
101-
compression: ORCCompression = CSVCompression.NONE
100+
compression: ORCCompression = ORCCompression.ZLIB
102101

103102

104103
class Parquet(BaseModel):
105104
type: PARQUET_FORMAT
106-
compression: ParquetCompression = CSVCompression.NONE
105+
compression: ParquetCompression = ParquetCompression.SNAPPY

tests/resources/file_df_connection/generate_files.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def get_pyarrow_schema() -> ArrowSchema:
6464
pa.field("REGION", pa.string()),
6565
pa.field("NUMBER", pa.int32()),
6666
pa.field("BIRTH_DATE", pa.date32()),
67-
pa.field("REGISTERED_AT", pa.timestamp("ms")),
67+
pa.field("REGISTERED_AT", pa.timestamp("us")),
6868
pa.field("ACCOUNT_BALANCE", pa.float64()),
6969
],
7070
)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ async def test_run_transfer_hdfs_to_postgres(
185185
df = reader.run()
186186

187187
# Excel does not support datetime values with precision greater than milliseconds, so REGISTERED_AT is truncated to whole seconds in both DataFrames
188-
if file_format in ("excel", "parquet", "orc"):
188+
if file_format == "excel":
189189
df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
190190
init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
191191

@@ -276,7 +276,7 @@ async def test_run_transfer_postgres_to_hdfs(
276276
df = reader.run()
277277

278278
# Excel does not support datetime values with precision greater than milliseconds, so REGISTERED_AT in init_df is reduced to millisecond precision
279-
if format_name in ("excel", "parquet"):
279+
if format_name == "excel":
280280
init_df = init_df.withColumn(
281281
"REGISTERED_AT",
282282
to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

tests/test_integration/test_run_transfer/test_s3.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ async def test_run_transfer_s3_to_postgres(
186186
df = reader.run()
187187

188188
# Excel does not support datetime values with precision greater than milliseconds, so REGISTERED_AT is truncated to whole seconds in both DataFrames
189-
if file_format in ("excel", "parquet", "orc"):
189+
if file_format == "excel":
190190
df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
191191
init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
192192

@@ -277,7 +277,7 @@ async def test_run_transfer_postgres_to_s3(
277277
df = reader.run()
278278

279279
# Excel does not support datetime values with precision greater than milliseconds, so REGISTERED_AT in init_df is reduced to millisecond precision
280-
if format_name in ("excel", "parquet"):
280+
if format_name == "excel":
281281
init_df = init_df.withColumn(
282282
"REGISTERED_AT",
283283
to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ async def test_developer_plus_can_create_s3_transfer(
171171
},
172172
"orc": {
173173
"type": "orc",
174-
"compression": "none",
174+
"compression": "zlib",
175175
},
176176
"parquet": {
177177
"type": "parquet",
@@ -327,7 +327,7 @@ async def test_developer_plus_can_create_hdfs_transfer(
327327
},
328328
"orc": {
329329
"type": "orc",
330-
"compression": "none",
330+
"compression": "zlib",
331331
},
332332
"parquet": {
333333
"type": "parquet",

0 commit comments

Comments
 (0)