Commit 4cf5578

Author: Bob Strahan
Fix Glue table data issue for document classes with dashes
1 parent 572dcdb

File tree

2 files changed: +13 additions, -8 deletions


CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,9 @@ SPDX-License-Identifier: MIT-0
 
 ### Added
 
+### Fixed
+- Fix missing data in Glue tables when using a document class that contains a dash (-).
+
 ## [0.3.16]
 
 ### Added

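To make the changelog entry concrete: a minimal, self-contained sketch (not part of the commit; the class name "Bank-Statement" is hypothetical) of the two escaping rules introduced in the diff below. The dash is replaced in the Glue table name, but kept in the S3 prefix so the table location continues to match data already written.

import re

# Hypothetical document class containing a dash
section_type = "Bank-Statement"

# Table-name-safe: the dash is replaced along with other unsafe characters
section_type_tablename = re.sub(r"[/\\:*?\"<>|-]", "_", section_type.lower())
# S3-prefix-safe: the dash is preserved for compatibility with data already stored
section_type_prefix = re.sub(r"[/\\:*?\"<>|]", "_", section_type.lower())

print(f"document_sections_{section_type_tablename}")  # document_sections_bank_statement
print(f"document_sections/{section_type_prefix}/")    # document_sections/bank-statement/
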
lib/idp_common_pkg/idp_common/reporting/save_reporting_data.py

Lines changed: 10 additions & 8 deletions
@@ -375,9 +375,11 @@ def _create_or_update_glue_table(
             )
             return False
 
-        # Escape section_type to make it table-name-safe
-        escaped_section_type = re.sub(r"[/\\:*?\"<>|-]", "_", section_type.lower())
-        table_name = f"document_sections_{escaped_section_type}"
+        # Escape section_type to make it table-name-safe and s3 prefix-safe
+        # Note: we escape '-' in the table name but not in the s3 prefix, solely to preserve backward compatibility with data already stored.
+        section_type_tablename = re.sub(r"[/\\:*?\"<>|-]", "_", section_type.lower())
+        section_type_prefix = re.sub(r"[/\\:*?\"<>|]", "_", section_type.lower())
+        table_name = f"document_sections_{section_type_tablename}"
 
         # Convert schema to Glue columns
         columns = self._convert_schema_to_glue_columns(schema)
@@ -388,7 +390,7 @@ def _create_or_update_glue_table(
             "Description": f"Document sections table for type: {section_type}",
             "StorageDescriptor": {
                 "Columns": columns,
-                "Location": f"s3://{self.reporting_bucket}/document_sections/{escaped_section_type}/",
+                "Location": f"s3://{self.reporting_bucket}/document_sections/{section_type_prefix}/",
                 "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
                 "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
                 "Compressed": True,
@@ -407,7 +409,7 @@ def _create_or_update_glue_table(
                 "projection.date.range": "2024-01-01,2030-12-31",
                 "projection.date.interval": "1",
                 "projection.date.interval.unit": "DAYS",
-                "storage.location.template": f"s3://{self.reporting_bucket}/document_sections/{escaped_section_type}/date=${{date}}/",
+                "storage.location.template": f"s3://{self.reporting_bucket}/document_sections/{section_type_prefix}/date=${{date}}/",
             },
         }
 
@@ -1244,13 +1246,13 @@ def save_document_sections(self, document: Document) -> Optional[Dict[str, Any]]
                 section.classification if section.classification else "unknown"
             )
             # Escape section_type to make it filesystem-safe and lowercase for consistency
-            escaped_section_type = re.sub(
-                r"[/\\:*?\"<>|-]", "_", section_type.lower()
+            section_type_prefix = re.sub(
+                r"[/\\:*?\"<>|]", "_", section_type.lower()
             )
 
             s3_key = (
                 f"document_sections/"
-                f"{escaped_section_type}/"
+                f"{section_type_prefix}/"
                 f"date={date_partition}/"
                 f"{escaped_doc_id}_section_{section.section_id}.parquet"
             )
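
As a rough consistency check on the hunks above: a hedged sketch (bucket name, document ID, and date are hypothetical) showing that an S3 key of the shape written by save_document_sections falls under the location that the table's storage.location.template projects for the same date.

# Hypothetical values; the real code uses self.reporting_bucket and real section data
bucket = "example-reporting-bucket"
section_type_prefix = "bank-statement"  # dash preserved, matching the fix above

# Partition-projection template registered on the Glue table
template = f"s3://{bucket}/document_sections/{section_type_prefix}/date=${{date}}/"
# Path Athena resolves for one projected date value
resolved = template.replace("${date}", "2025-01-15")

# Shape of the key written by save_document_sections for that date
s3_key = f"document_sections/{section_type_prefix}/date=2025-01-15/mydoc_section_1.parquet"

assert f"s3://{bucket}/{s3_key}".startswith(resolved)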
