55import os
66import re
77import html .entities
8- import logging
98import struct
109import xml .etree .ElementTree as ET
1110from typing import Optional , Dict , Any , Iterator , BinaryIO , Union , Tuple
1211from snowflake .snowpark .files import SnowflakeFile
1312
1413
1514DEFAULT_CHUNK_SIZE : int = 1024
15+ VARIANT_COLUMN_SIZE_LIMIT : int = 16 * 1024 * 1024
1616
1717
1818def replace_entity (match : re .Match ) -> str :
@@ -76,7 +76,7 @@ def tag_is_self_closing(
7676 chunk_start_pos = file_obj .tell ()
7777 chunk = file_obj .read (chunk_size )
7878 if not chunk :
79- raise EOFError ("EOF reached before end of opening tag" )
79+ raise EOFError ("Reached end of file but the tag is not closed " )
8080
8181 for idx , b in enumerate (struct .unpack (f"{ len (chunk )} c" , chunk )):
8282 # '>' inside quote should not be considered as the end of the tag
@@ -216,7 +216,7 @@ def find_next_opening_tag_pos(
216216 # Calculate the absolute position. Note that `data` starts at (current_pos - len(overlap)).
217217 absolute_pos = current_pos + pos - len (overlap )
218218 if absolute_pos >= end_limit :
219- raise EOFError ("Found tag beyond end limit" )
219+ raise EOFError ("Exceeded end limit before finding opening tag " )
220220 file_obj .seek (absolute_pos )
221221 return absolute_pos
222222
@@ -298,6 +298,8 @@ def process_xml_range(
298298 tag_name : str ,
299299 approx_start : int ,
300300 approx_end : int ,
301+ mode : str ,
302+ column_name_of_corrupt_record : str ,
301303 chunk_size : int = DEFAULT_CHUNK_SIZE ,
302304) -> Iterator [Optional [Dict [str , Any ]]]:
303305 """
@@ -316,6 +318,9 @@ def process_xml_range(
316318 tag_name (str): The tag that delimits records (e.g., "row").
317319 approx_start (int): Approximate start byte position.
318320 approx_end (int): Approximate end byte position.
321+ mode (str): The mode for dealing with corrupt records.
322+ "PERMISSIVE", "DROPMALFORMED" and "FAILFAST" are supported.
323+ column_name_of_corrupt_record (str): The name of the column for corrupt records.
319324 chunk_size (int): Size of chunks to read.
320325
321326 Yields:
@@ -351,8 +356,19 @@ def process_xml_range(
351356 # decide whether the row element is self‑closing
352357 try :
353358 is_self_close , tag_end = tag_is_self_closing (f )
354- except EOFError :
355- # malformed XML record
359+ # encountering an EOFError means the XML record isn't self-closing or
360+ # doesn't have a closing tag after reaching the end of the file
361+ except EOFError as e :
362+ if mode == "PERMISSIVE" :
363+ # read until the end of file or until the variant column size limit
364+ record_bytes = f .read (VARIANT_COLUMN_SIZE_LIMIT )
365+ record_str = record_bytes .decode ("utf-8" , errors = "replace" )
366+ record_str = re .sub (r"&(\w+);" , replace_entity , record_str )
367+ yield {column_name_of_corrupt_record : record_str }
368+ elif mode == "FAILFAST" :
369+ raise EOFError (
370+ f"Malformed XML record at bytes { record_start } -EOF: { e } "
371+ ) from e
356372 break
357373
358374 if is_self_close :
@@ -361,31 +377,37 @@ def process_xml_range(
361377 f .seek (tag_end )
362378 try :
363379 record_end = find_next_closing_tag_pos (f , closing_tag , chunk_size )
364- except EOFError :
365- # incomplete XML record
380+ # encountering an EOFError means the XML record isn't self-closing or
381+ # doesn't have a closing tag after reaching the end of the file
382+ except EOFError as e :
383+ if mode == "PERMISSIVE" :
384+ # read until the end of file or until the variant column size limit
385+ record_bytes = f .read (VARIANT_COLUMN_SIZE_LIMIT )
386+ record_str = record_bytes .decode ("utf-8" , errors = "replace" )
387+ record_str = re .sub (r"&(\w+);" , replace_entity , record_str )
388+ yield {column_name_of_corrupt_record : record_str }
389+ elif mode == "FAILFAST" :
390+ raise EOFError (
391+ f"Malformed XML record at bytes { record_start } -EOF: { e } "
392+ ) from e
366393 break
367394
368395 # Read the complete XML record.
369396 f .seek (record_start )
370397 record_bytes = f .read (record_end - record_start )
371- try :
372- record_str = record_bytes .decode ("utf-8" )
373- record_str = re .sub (r"&(\w+);" , replace_entity , record_str )
374- except UnicodeDecodeError as e :
375- logging .warning (
376- f"Unicode decode error at bytes { record_start } -{ record_end } : { e } "
377- )
378- f .seek (record_end )
379- continue
398+ record_str = record_bytes .decode ("utf-8" , errors = "replace" )
399+ record_str = re .sub (r"&(\w+);" , replace_entity , record_str )
380400
381401 try :
382402 element = ET .fromstring (record_str )
383403 yield element_to_dict (strip_namespaces (element ))
384404 except ET .ParseError as e :
385- logging .warning (
386- f"XML parse error at bytes { record_start } -{ record_end } : { e } "
387- )
388- logging .warning (f"Record content: { record_str } " )
405+ if mode == "PERMISSIVE" :
406+ yield {column_name_of_corrupt_record : record_str }
407+ elif mode == "FAILFAST" :
408+ raise RuntimeError (
409+ f"Malformed XML record at bytes { record_start } -{ record_end } : { e } "
410+ )
389411
390412 if record_end > approx_end :
391413 break
@@ -395,7 +417,15 @@ def process_xml_range(
395417
396418
397419class XMLReader :
398- def process (self , filename : str , num_workers : int , row_tag : str , i : int ):
420+ def process (
421+ self ,
422+ filename : str ,
423+ num_workers : int ,
424+ row_tag : str ,
425+ i : int ,
426+ mode : str ,
427+ column_name_of_corrupt_record : str ,
428+ ):
399429 """
400430 Splits the file into byte ranges—one per worker—by starting with an even
401431 file size division and then moving each boundary to the end of a record,
@@ -406,10 +436,20 @@ def process(self, filename: str, num_workers: int, row_tag: str, i: int):
406436 num_workers (int): Number of workers/chunks.
407437 row_tag (str): The tag name that delimits records (e.g., "row").
408438 i (int): The worker id.
439+ mode (str): The mode for dealing with corrupt records.
440+ "PERMISSIVE", "DROPMALFORMED" and "FAILFAST" are supported.
441+ column_name_of_corrupt_record (str): The name of the column for corrupt records.
409442 """
410443 file_size = get_file_size (filename )
411444 approx_chunk_size = file_size // num_workers
412445 approx_start = approx_chunk_size * i
413446 approx_end = approx_chunk_size * (i + 1 ) if i < num_workers - 1 else file_size
414- for element in process_xml_range (filename , row_tag , approx_start , approx_end ):
447+ for element in process_xml_range (
448+ filename ,
449+ row_tag ,
450+ approx_start ,
451+ approx_end ,
452+ mode ,
453+ column_name_of_corrupt_record ,
454+ ):
415455 yield (element ,)
0 commit comments