Commit 635610e

Author: Ilyas Gasanov
[DOP-21450] Add compression options for file formats except xml
1 parent: a8a4bac

File tree

12 files changed: +13 -29 lines

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-Add compression options to file formats
+Add compression options to file formats CSV, JSON, JSONLine, Excel, ORC, Parquet

syncmaster/schemas/v1/transfers/file_format.py

Lines changed: 5 additions & 15 deletions

@@ -18,15 +18,13 @@


 class ORCCompression(str, Enum):
-    UNCOMPRESSED = "uncompressed"
     NONE = "none"
     SNAPPY = "snappy"
     ZLIB = "zlib"
     LZ4 = "lz4"


 class ParquetCompression(str, Enum):
-    UNCOMPRESSED = "uncompressed"
     NONE = "none"
     SNAPPY = "snappy"
     GZIP = "gzip"
@@ -51,13 +49,6 @@ class CSVCompression(str, Enum):
     DEFLATE = "deflate"


-class XMLCompression(str, Enum):
-    BZIP2 = "bzip2"
-    GZIP = "gzip"
-    LZ4 = "lz4"
-    SNAPPY = "snappy"
-
-
 class CSV(BaseModel):
     type: CSV_FORMAT
     delimiter: str = ","
@@ -66,21 +57,21 @@ class CSV(BaseModel):
     escape: str = "\\"
     include_header: bool = False
     line_sep: str = "\n"
-    compression: CSVCompression = CSVCompression.NONE
+    compression: CSVCompression = CSVCompression.GZIP


 class JSONLine(BaseModel):
     type: JSONLINE_FORMAT
     encoding: str = "utf-8"
     line_sep: str = "\n"
-    compression: JSONCompression = CSVCompression.NONE
+    compression: JSONCompression = JSONCompression.GZIP


 class JSON(BaseModel):
     type: JSON_FORMAT
     encoding: str = "utf-8"
     line_sep: str = "\n"
-    compression: JSONCompression = CSVCompression.NONE
+    compression: JSONCompression = JSONCompression.GZIP


 class Excel(BaseModel):
@@ -93,14 +84,13 @@ class XML(BaseModel):
     type: XML_FORMAT
     root_tag: str
     row_tag: str
-    compression: XMLCompression | None = None


 class ORC(BaseModel):
     type: ORC_FORMAT
-    compression: ORCCompression = CSVCompression.NONE
+    compression: ORCCompression = ORCCompression.ZLIB


 class Parquet(BaseModel):
     type: PARQUET_FORMAT
-    compression: ParquetCompression = CSVCompression.NONE
+    compression: ParquetCompression = ParquetCompression.SNAPPY

tests/resources/file_df_connection/generate_files.py

Lines changed: 1 addition & 1 deletion

@@ -64,7 +64,7 @@ def get_pyarrow_schema() -> ArrowSchema:
            pa.field("REGION", pa.string()),
            pa.field("NUMBER", pa.int32()),
            pa.field("BIRTH_DATE", pa.date32()),
-            pa.field("REGISTERED_AT", pa.timestamp("ms")),
+            pa.field("REGISTERED_AT", pa.timestamp("us")),
            pa.field("ACCOUNT_BALANCE", pa.float64()),
        ],
    )
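The unit change matters because Python datetimes (and Spark timestamps) carry microseconds: a millisecond-unit Arrow column silently truncates them, which is presumably why the parquet/orc truncation branches in the integration tests below could be dropped. An illustrative sketch, independent of generate_files.py:

# Illustrative sketch (not from generate_files.py): microsecond vs millisecond
# timestamp units in pyarrow.
from datetime import datetime

import pyarrow as pa

value = datetime(2024, 1, 1, 12, 0, 0, 123456)

us_array = pa.array([value])  # Python datetimes are inferred as timestamp[us]
ms_array = us_array.cast(pa.timestamp("ms"), safe=False)  # truncates sub-millisecond digits

print(us_array.type, us_array[0])  # timestamp[us], fractional part .123456 kept
print(ms_array.type, ms_array[0])  # timestamp[ms], microseconds lost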
4 binary files not shown.

tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 2 additions & 2 deletions

@@ -185,7 +185,7 @@ async def test_run_transfer_hdfs_to_postgres(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format in ("excel", "parquet", "orc"):
+    if file_format == "excel":
        df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
        init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -276,7 +276,7 @@ async def test_run_transfer_postgres_to_hdfs(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name in ("excel", "parquet"):
+    if format_name == "excel":
        init_df = init_df.withColumn(
            "REGISTERED_AT",
            to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

tests/test_integration/test_run_transfer/test_s3.py

Lines changed: 2 additions & 2 deletions

@@ -186,7 +186,7 @@ async def test_run_transfer_s3_to_postgres(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if file_format in ("excel", "parquet", "orc"):
+    if file_format == "excel":
        df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
        init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))

@@ -277,7 +277,7 @@ async def test_run_transfer_postgres_to_s3(
    df = reader.run()

    # as Excel does not support datetime values with precision greater than milliseconds
-    if format_name in ("excel", "parquet"):
+    if format_name == "excel":
        init_df = init_df.withColumn(
            "REGISTERED_AT",
            to_timestamp(date_format(col("REGISTERED_AT"), "yyyy-MM-dd HH:mm:ss.SSS")),

tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py

Lines changed: 2 additions & 6 deletions

@@ -59,7 +59,6 @@
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "lz4",
    },
    "options": {
        "some": "option",
@@ -167,11 +166,10 @@ async def test_developer_plus_can_create_s3_transfer(
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "lz4",
    },
    "orc": {
        "type": "orc",
-        "compression": "none",
+        "compression": "zlib",
    },
    "parquet": {
        "type": "parquet",
@@ -223,7 +221,6 @@ async def test_developer_plus_can_create_s3_transfer(
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "bzip2",
    },
},
{
@@ -323,11 +320,10 @@ async def test_developer_plus_can_create_hdfs_transfer(
        "type": "xml",
        "root_tag": "data",
        "row_tag": "record",
-        "compression": "bzip2",
    },
    "orc": {
        "type": "orc",
-        "compression": "none",
+        "compression": "zlib",
    },
    "parquet": {
        "type": "parquet",
