
Commit 1c8f5bd

Author: Ilyas Gasanov (committed)
[DOP-21450] Add compressions for file formats
1 parent a5d7e3b commit 1c8f5bd

4 files changed: +15 -20 lines


syncmaster/schemas/v1/transfers/file_format.py

Lines changed: 0 additions & 5 deletions
@@ -22,8 +22,6 @@ class ORCCompression(str, Enum):
     NONE = "none"
     SNAPPY = "snappy"
     ZLIB = "zlib"
-    LZO = "lzo"
-    ZSTD = "zstd"
     LZ4 = "lz4"


@@ -32,10 +30,7 @@ class ParquetCompression(str, Enum):
     NONE = "none"
     SNAPPY = "snappy"
     GZIP = "gzip"
-    LZO = "lzo"
-    ZSTD = "zstd"
     LZ4 = "lz4"
-    BROTLI = "brotli"


 class JSONCompression(str, Enum):

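For reference, the two affected enums as they would read after this change, reconstructed from the hunks above (only the members visible in the diff are shown):

from enum import Enum


class ORCCompression(str, Enum):
    NONE = "none"
    SNAPPY = "snappy"
    ZLIB = "zlib"
    LZ4 = "lz4"


class ParquetCompression(str, Enum):
    NONE = "none"
    SNAPPY = "snappy"
    GZIP = "gzip"
    LZ4 = "lz4"
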
tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 6 additions & 6 deletions
@@ -184,8 +184,8 @@ async def test_run_transfer_hdfs_to_postgres(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format == "excel":
+    # as these file formats do not support datetime values with precision greater than milliseconds
+    if file_format in ("excel", "parquet", "orc"):
         df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
         init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -214,12 +214,12 @@ async def test_run_transfer_hdfs_to_postgres(
             id="excel",
         ),
         pytest.param(
-            ("orc", {"compression": "lzo"}),
+            ("orc", {"compression": "snappy"}),
             "with_compression",
             id="orc",
         ),
         pytest.param(
-            ("parquet", {"compression": "brotli"}),
+            ("parquet", {"compression": "lz4"}),
             "with_compression",
             id="parquet",
         ),
@@ -275,8 +275,8 @@ async def test_run_transfer_postgres_to_hdfs(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name == "excel":
+    # as these file formats do not support datetime values with precision greater than milliseconds
+    if format_name in ("excel", "parquet", "orc"):
         init_df = init_df.withColumn(
             "REGISTERED_AT",
             to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

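A note on the timestamp truncation changed in both hunks above: the test normalizes REGISTERED_AT to whole seconds on both sides of the comparison because, per the updated comment, Excel, Parquet and ORC do not preserve sub-millisecond precision here. A minimal standalone illustration of the same date_trunc call, assuming a local SparkSession; this is a sketch, not code taken from the repository:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_trunc

spark = SparkSession.builder.master("local[1]").getOrCreate()

# A single high-precision timestamp, stored first as a string and then cast.
df = spark.createDataFrame(
    [("2024-01-01 12:34:56.789123",)], ["REGISTERED_AT"]
).withColumn("REGISTERED_AT", col("REGISTERED_AT").cast("timestamp"))

# Truncate to whole seconds so values written through formats with coarser
# timestamp precision still compare equal to the original DataFrame.
truncated = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
truncated.show(truncate=False)  # 2024-01-01 12:34:56
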
tests/test_integration/test_run_transfer/test_s3.py

Lines changed: 5 additions & 5 deletions
@@ -185,8 +185,8 @@ async def test_run_transfer_s3_to_postgres(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format == "excel":
+    # as these file formats do not support datetime values with precision greater than milliseconds
+    if file_format in ("excel", "parquet", "orc"):
         df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
         init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -220,7 +220,7 @@ async def test_run_transfer_s3_to_postgres(
             id="orc",
         ),
         pytest.param(
-            ("parquet", {"compression": "brotli"}),
+            ("parquet", {"compression": "gzip"}),
             "with_compression",
             id="parquet",
         ),
@@ -276,8 +276,8 @@ async def test_run_transfer_postgres_to_s3(
     )
     df = reader.run()

-    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name == "excel":
+    # as these file formats do not support datetime values with precision greater than milliseconds
+    if format_name in ("excel", "parquet", "orc"):
         init_df = init_df.withColumn(
             "REGISTERED_AT",
             to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

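The compression values chosen in these parametrizations ("snappy", "lz4", "gzip") map onto the "compression" option of Spark's file writers. A rough sketch of the equivalent write calls, under the assumption that the transfer ultimately goes through Spark's built-in Parquet and ORC writers; the paths and DataFrame are illustrative only:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(10).withColumnRenamed("id", "ID")

# The codec name is passed through the writer's "compression" option;
# the values below match the updated test parametrizations.
df.write.mode("overwrite").option("compression", "gzip").parquet("/tmp/out_parquet")
df.write.mode("overwrite").option("compression", "snappy").orc("/tmp/out_orc")
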
tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py

Lines changed: 4 additions & 4 deletions
@@ -80,7 +80,7 @@
         "directory_path": "/some/parquet/path",
         "file_format": {
             "type": "parquet",
-            "compression": "zstd",
+            "compression": "gzip",
         },
         "options": {
             "some": "option",
@@ -175,7 +175,7 @@ async def test_developer_plus_can_create_s3_transfer(
         },
         "parquet": {
             "type": "parquet",
-            "compression": "zstd",
+            "compression": "gzip",
         },
     }
@@ -238,7 +238,7 @@ async def test_developer_plus_can_create_s3_transfer(
         "directory_path": "/some/parquet/path",
         "file_format": {
             "type": "parquet",
-            "compression": "brotli",
+            "compression": "snappy",
         },
     },
 ],
@@ -331,7 +331,7 @@ async def test_developer_plus_can_create_hdfs_transfer(
         },
         "parquet": {
             "type": "parquet",
-            "compression": "brotli",
+            "compression": "snappy",
         },
     }

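The request payloads above are presumably validated against the compression enums from syncmaster/schemas/v1/transfers/file_format.py, which is why "zstd" and "brotli" had to be swapped for still-supported codecs. A minimal sketch of that validation step, using a hypothetical Pydantic model name (ParquetFileFormat) purely for illustration; the real schema classes are not shown in this diff:

from enum import Enum
from typing import Literal

from pydantic import BaseModel, ValidationError


class ParquetCompression(str, Enum):
    NONE = "none"
    SNAPPY = "snappy"
    GZIP = "gzip"
    LZ4 = "lz4"


class ParquetFileFormat(BaseModel):  # hypothetical name, for illustration only
    type: Literal["parquet"] = "parquet"
    compression: ParquetCompression


ParquetFileFormat(type="parquet", compression="gzip")  # accepted
try:
    ParquetFileFormat(type="parquet", compression="brotli")  # rejected after this change
except ValidationError as exc:
    print(exc)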