@@ -375,9 +375,11 @@ def _create_or_update_glue_table(
375375 )
376376 return False
377377
378- # Escape section_type to make it table-name-safe
379- escaped_section_type = re .sub (r"[/\\:*?\"<>|-]" , "_" , section_type .lower ())
380- table_name = f"document_sections_{ escaped_section_type } "
378+ # Escape section_type to make it table-name-safe and s3 prefix-safe
379+ # Note: we escape '-' in tablename but not in s3 prefix, only to provide backward compatibility for data already stored.
380+ section_type_tablename = re .sub (r"[/\\:*?\"<>|-]" , "_" , section_type .lower ())
381+ section_type_prefix = re .sub (r"[/\\:*?\"<>|]" , "_" , section_type .lower ())
382+ table_name = f"document_sections_{ section_type_tablename } "
381383
382384 # Convert schema to Glue columns
383385 columns = self ._convert_schema_to_glue_columns (schema )
@@ -388,7 +390,7 @@ def _create_or_update_glue_table(
388390 "Description" : f"Document sections table for type: { section_type } " ,
389391 "StorageDescriptor" : {
390392 "Columns" : columns ,
391- "Location" : f"s3://{ self .reporting_bucket } /document_sections/{ escaped_section_type } /" ,
393+ "Location" : f"s3://{ self .reporting_bucket } /document_sections/{ section_type_prefix } /" ,
392394 "InputFormat" : "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat" ,
393395 "OutputFormat" : "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat" ,
394396 "Compressed" : True ,
@@ -407,7 +409,7 @@ def _create_or_update_glue_table(
407409 "projection.date.range" : "2024-01-01,2030-12-31" ,
408410 "projection.date.interval" : "1" ,
409411 "projection.date.interval.unit" : "DAYS" ,
410- "storage.location.template" : f"s3://{ self .reporting_bucket } /document_sections/{ escaped_section_type } /date=${{date}}/" ,
412+ "storage.location.template" : f"s3://{ self .reporting_bucket } /document_sections/{ section_type_prefix } /date=${{date}}/" ,
411413 },
412414 }
413415
@@ -1244,13 +1246,13 @@ def save_document_sections(self, document: Document) -> Optional[Dict[str, Any]]
12441246 section .classification if section .classification else "unknown"
12451247 )
12461248 # Escape section_type to make it filesystem-safe and lowercase for consistency
1247- escaped_section_type = re .sub (
1248- r"[/\\:*?\"<>|- ]" , "_" , section_type .lower ()
1249+ section_type_prefix = re .sub (
1250+ r"[/\\:*?\"<>|]" , "_" , section_type .lower ()
12491251 )
12501252
12511253 s3_key = (
12521254 f"document_sections/"
1253- f"{ escaped_section_type } /"
1255+ f"{ section_type_prefix } /"
12541256 f"date={ date_partition } /"
12551257 f"{ escaped_doc_id } _section_{ section .section_id } .parquet"
12561258 )
0 commit comments