1313
# Default number of bytes read per chunk; used as the default ``chunk_size``
# of ``process_xml_range``.
DEFAULT_CHUNK_SIZE: int = 1024
# Upper bound (16 MiB) on how many bytes of a malformed record are read back
# from the file before being emitted in the corrupt-record column.
VARIANT_COLUMN_SIZE_LIMIT: int = 16 * 1024 * 1024
16- COLUMN_NAME_OF_CORRUPT_RECORD = "columnNameOfCorruptRecord"
1716
1817
1918def replace_entity (match : re .Match ) -> str :
@@ -300,6 +299,7 @@ def process_xml_range(
300299 approx_start : int ,
301300 approx_end : int ,
302301 mode : str ,
302+ column_name_of_corrupt_record : str ,
303303 chunk_size : int = DEFAULT_CHUNK_SIZE ,
304304) -> Iterator [Optional [Dict [str , Any ]]]:
305305 """
@@ -320,6 +320,7 @@ def process_xml_range(
320320 approx_end (int): Approximate end byte position.
321321 mode (str): The mode for dealing with corrupt records.
322322 "PERMISSIVE", "DROPMALFORMED" and "FAILFAST" are supported.
323+ column_name_of_corrupt_record (str): The name of the column for corrupt records.
323324 chunk_size (int): Size of chunks to read.
324325
325326 Yields:
@@ -363,7 +364,7 @@ def process_xml_range(
363364 record_bytes = f .read (VARIANT_COLUMN_SIZE_LIMIT )
364365 record_str = record_bytes .decode ("utf-8" , errors = "replace" )
365366 record_str = re .sub (r"&(\w+);" , replace_entity , record_str )
366- yield {COLUMN_NAME_OF_CORRUPT_RECORD : record_str }
367+ yield {column_name_of_corrupt_record : record_str }
367368 elif mode == "FAILFAST" :
368369 raise EOFError (
369370 f"Malformed XML record at bytes { record_start } -EOF: { e } "
@@ -384,7 +385,7 @@ def process_xml_range(
384385 record_bytes = f .read (VARIANT_COLUMN_SIZE_LIMIT )
385386 record_str = record_bytes .decode ("utf-8" , errors = "replace" )
386387 record_str = re .sub (r"&(\w+);" , replace_entity , record_str )
387- yield {COLUMN_NAME_OF_CORRUPT_RECORD : record_str }
388+ yield {column_name_of_corrupt_record : record_str }
388389 elif mode == "FAILFAST" :
389390 raise EOFError (
390391 f"Malformed XML record at bytes { record_start } -EOF: { e } "
@@ -402,7 +403,7 @@ def process_xml_range(
402403 yield element_to_dict (strip_namespaces (element ))
403404 except ET .ParseError as e :
404405 if mode == "PERMISSIVE" :
405- yield {COLUMN_NAME_OF_CORRUPT_RECORD : record_str }
406+ yield {column_name_of_corrupt_record : record_str }
406407 elif mode == "FAILFAST" :
407408 raise RuntimeError (
408409 f"Malformed XML record at bytes { record_start } -{ record_end } : { e } "
@@ -416,7 +417,15 @@ def process_xml_range(
416417
417418
class XMLReader:
    """Reads the portion of an XML file assigned to a single worker."""

    def process(
        self,
        filename: str,
        num_workers: int,
        row_tag: str,
        i: int,
        mode: str,
        column_name_of_corrupt_record: str,
    ):
        """
        Splits the file into byte ranges—one per worker—by starting with an even
        file size division, then parses the range that belongs to worker ``i``.

        The start and end offsets computed here are only approximate; they are
        handed to ``process_xml_range``, which is expected to align them to
        record boundaries (NOTE(review): confirm against ``process_xml_range``).

        Args:
            filename (str): Path of the XML file; passed to ``get_file_size``
                and ``process_xml_range``.
            num_workers (int): Total number of workers the file is divided among.
            row_tag (str): Forwarded to ``process_xml_range``; presumably the
                tag name that delimits one record.
            i (int): The worker id.
            mode (str): The mode for dealing with corrupt records.
                "PERMISSIVE", "DROPMALFORMED" and "FAILFAST" are supported.
            column_name_of_corrupt_record (str): The name of the column for
                corrupt records.

        Yields:
            A 1-tuple wrapping each element produced by ``process_xml_range``.
        """
        file_size = get_file_size(filename)
        approx_chunk_size = file_size // num_workers
        approx_start = approx_chunk_size * i
        # The last worker runs to the end of the file so the remainder of the
        # integer division is not dropped.
        approx_end = approx_chunk_size * (i + 1) if i < num_workers - 1 else file_size
        for element in process_xml_range(
            filename,
            row_tag,
            approx_start,
            approx_end,
            mode,
            column_name_of_corrupt_record,
        ):
            yield (element,)
0 commit comments