Skip to content

Commit 6716d50

Browse files
author
Ilyas Gasanov
committed
[DOP-21976] Add compression options for XML
1 parent beecd7a commit 6716d50

File tree

8 files changed

+26
-5
lines changed

8 files changed

+26
-5
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add compression options to XML file format

syncmaster/dto/transfers.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,20 @@ class FileTransferDTO(TransferDTO):
3636

3737
def __post_init__(self):
3838
if isinstance(self.file_format, dict):
39-
self.file_format = self._get_format(self.file_format.copy())
39+
self.file_format = self._get_file_format(self.file_format.copy())
4040
if isinstance(self.df_schema, str):
4141
self.df_schema = json.loads(self.df_schema)
4242

43-
def _get_format(self, file_format: dict):
43+
def _get_file_format(self, file_format: dict) -> CSV | JSONLine | JSON | Excel | XML | ORC | Parquet:
4444
file_type = file_format.pop("type", None)
45+
# spark-xml has no "none" compression option for XML, so the key is dropped before parsing: https://github.com/databricks/spark-xml?tab=readme-ov-file#features
46+
if file_type == "xml" and file_format.get("compression") == "none":
47+
file_format.pop("compression")
48+
4549
parser_class = self._format_parsers.get(file_type)
4650
if parser_class is not None:
4751
return parser_class.parse_obj(file_format)
52+
4853
raise ValueError(f"Unknown file type: {file_type}")
4954

5055

syncmaster/schemas/v1/transfers/file_format.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,14 @@ class CSVCompression(str, Enum):
4949
DEFLATE = "deflate"
5050

5151

52+
class XMLCompression(str, Enum):
53+
NONE = "none"
54+
BZIP2 = "bzip2"
55+
GZIP = "gzip"
56+
LZ4 = "lz4"
57+
SNAPPY = "snappy"
58+
59+
5260
class CSV(BaseModel):
5361
type: CSV_FORMAT
5462
delimiter: str = ","
@@ -84,6 +92,7 @@ class XML(BaseModel):
8492
type: XML_FORMAT
8593
root_tag: str
8694
row_tag: str
95+
compression: XMLCompression = XMLCompression.GZIP
8796

8897

8998
class ORC(BaseModel):

tests/test_integration/test_run_transfer/test_hdfs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@ async def test_run_transfer_hdfs_to_postgres(
229229
id="parquet",
230230
),
231231
pytest.param(
232-
("xml", {}),
233-
"without_compression",
232+
("xml", {"compression": "snappy"}),
233+
"with_compression",
234234
id="xml",
235235
),
236236
],

tests/test_integration/test_run_transfer/test_s3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ async def test_run_transfer_s3_to_postgres(
230230
id="parquet",
231231
),
232232
pytest.param(
233-
("xml", {}),
233+
("xml", {"compression": "none"}),
234234
"without_compression",
235235
id="xml",
236236
),

tests/test_unit/test_transfers/test_file_transfers/test_create_transfer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
"type": "xml",
6060
"root_tag": "data",
6161
"row_tag": "record",
62+
"compression": "lz4",
6263
},
6364
"options": {
6465
"some": "option",
@@ -166,6 +167,7 @@ async def test_developer_plus_can_create_s3_transfer(
166167
"type": "xml",
167168
"root_tag": "data",
168169
"row_tag": "record",
170+
"compression": "lz4",
169171
},
170172
"orc": {
171173
"type": "orc",
@@ -221,6 +223,7 @@ async def test_developer_plus_can_create_s3_transfer(
221223
"type": "xml",
222224
"root_tag": "data",
223225
"row_tag": "record",
226+
"compression": "bzip2",
224227
},
225228
},
226229
{
@@ -320,6 +323,7 @@ async def test_developer_plus_can_create_hdfs_transfer(
320323
"type": "xml",
321324
"root_tag": "data",
322325
"row_tag": "record",
326+
"compression": "bzip2",
323327
},
324328
"orc": {
325329
"type": "orc",

tests/test_unit/test_transfers/test_file_transfers/test_read_transfer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"type": "xml",
4242
"root_tag": "data",
4343
"row_tag": "record",
44+
"compression": "bzip2",
4445
},
4546
"options": {},
4647
},

tests/test_unit/test_transfers/test_file_transfers/test_update_transfer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"type": "xml",
4242
"root_tag": "data",
4343
"row_tag": "record",
44+
"compression": "bzip2",
4445
},
4546
"options": {},
4647
},

0 commit comments

Comments
 (0)