11import argparse
22import os
3+ import re
34import time
45import uuid
6+ from datetime import datetime
57
68import pandas as pd
79
8- from tools .aws_functions import download_file_from_s3
10+ from tools .aws_functions import download_file_from_s3 , export_outputs_to_s3
911from tools .config import (
1012 ACCESS_LOGS_FOLDER ,
1113 ALLOW_LIST_PATH ,
4648 REMOVE_DUPLICATE_ROWS ,
4749 RETURN_REDACTED_PDF ,
4850 RUN_AWS_FUNCTIONS ,
51+ S3_OUTPUTS_BUCKET ,
52+ S3_OUTPUTS_FOLDER ,
4953 S3_USAGE_LOGS_FOLDER ,
5054 SAVE_LOGS_TO_CSV ,
5155 SAVE_LOGS_TO_DYNAMODB ,
56+ SAVE_OUTPUTS_TO_S3 ,
5257 SESSION_OUTPUT_FOLDER ,
5358 SPACY_MODEL_PATH ,
5459 TEXTRACT_JOBS_LOCAL_LOC ,
@@ -68,6 +73,34 @@ def _generate_session_hash() -> str:
6873 return str (uuid .uuid4 ())[:8 ]
6974
7075
76+ def _sanitize_folder_name (folder_name : str , max_length : int = 50 ) -> str :
77+ """
78+ Sanitize folder name for S3 compatibility.
79+
80+ Replaces 'strange' characters (anything that's not alphanumeric, dash, underscore, or full stop)
81+ with underscores, and limits the length to max_length characters.
82+
83+ Args:
84+ folder_name: Original folder name to sanitize
85+ max_length: Maximum length for the folder name (default: 50)
86+
87+ Returns:
88+ Sanitized folder name
89+ """
90+ if not folder_name :
91+ return folder_name
92+
93+ # Replace any character that's not alphanumeric, dash, underscore, or full stop with underscore
94+ # This handles @, commas, exclamation marks, spaces, etc.
95+ sanitized = re .sub (r"[^a-zA-Z0-9._-]" , "_" , folder_name )
96+
97+ # Limit length to max_length
98+ if len (sanitized ) > max_length :
99+ sanitized = sanitized [:max_length ]
100+
101+ return sanitized
102+
103+
71104def get_username_and_folders (
72105 username : str = "" ,
73106 output_folder_textbox : str = OUTPUT_FOLDER ,
@@ -85,22 +118,25 @@ def get_username_and_folders(
85118 else :
86119 out_session_hash = _generate_session_hash ()
87120
121+ # Sanitize session hash for S3 compatibility (especially important for S3 folder paths)
122+ sanitized_session_hash = _sanitize_folder_name (out_session_hash )
123+
88124 if session_output_folder :
89- output_folder = output_folder_textbox + out_session_hash + "/"
90- input_folder = input_folder_textbox + out_session_hash + "/"
125+ output_folder = output_folder_textbox + sanitized_session_hash + "/"
126+ input_folder = input_folder_textbox + sanitized_session_hash + "/"
91127
92128 textract_document_upload_input_folder = (
93- textract_document_upload_input_folder + "/" + out_session_hash
129+ textract_document_upload_input_folder + "/" + sanitized_session_hash
94130 )
95131 textract_document_upload_output_folder = (
96- textract_document_upload_output_folder + "/" + out_session_hash
132+ textract_document_upload_output_folder + "/" + sanitized_session_hash
97133 )
98134
99135 s3_textract_document_logs_subfolder = (
100- s3_textract_document_logs_subfolder + "/" + out_session_hash
136+ s3_textract_document_logs_subfolder + "/" + sanitized_session_hash
101137 )
102138 local_textract_document_logs_subfolder = (
103- local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
139+ local_textract_document_logs_subfolder + "/" + sanitized_session_hash + "/"
104140 )
105141
106142 else :
@@ -196,6 +232,43 @@ def _download_s3_file_if_needed(
196232 raise Exception (f"Failed to download file from S3: { e } " )
197233
198234
235+ def _build_s3_output_folder (
236+ s3_outputs_folder : str ,
237+ session_hash : str ,
238+ save_to_user_folders : bool ,
239+ ) -> str :
240+ """
241+ Build the S3 output folder path with session hash and date suffix if needed.
242+
243+ Args:
244+ s3_outputs_folder: Base S3 folder path
245+ session_hash: Session hash/username
246+ save_to_user_folders: Whether to append session hash to folder path
247+
248+ Returns:
249+ Final S3 folder path with session hash and date suffix
250+ """
251+ if not s3_outputs_folder :
252+ return ""
253+
254+ # Append session hash if save_to_user_folders is enabled
255+ if save_to_user_folders and session_hash :
256+ sanitized_session_hash = _sanitize_folder_name (session_hash )
257+ s3_outputs_folder = (
258+ s3_outputs_folder .rstrip ("/" ) + "/" + sanitized_session_hash + "/"
259+ )
260+ else :
261+ # Ensure trailing slash
262+ if not s3_outputs_folder .endswith ("/" ):
263+ s3_outputs_folder = s3_outputs_folder + "/"
264+
265+ # Append today's date (YYYYMMDD/)
266+ today_suffix = datetime .now ().strftime ("%Y%m%d" ) + "/"
267+ s3_outputs_folder = s3_outputs_folder .rstrip ("/" ) + "/" + today_suffix
268+
269+ return s3_outputs_folder
270+
271+
199272# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
200273CHOSEN_COMPREHEND_ENTITIES .extend (CUSTOM_ENTITIES )
201274FULL_COMPREHEND_ENTITY_LIST .extend (CUSTOM_ENTITIES )
@@ -386,6 +459,21 @@ def main(direct_mode_args={}):
386459 default = DOCUMENT_REDACTION_BUCKET ,
387460 help = "S3 bucket name for cloud operations." ,
388461 )
462+ general_group .add_argument (
463+ "--save_outputs_to_s3" ,
464+ default = SAVE_OUTPUTS_TO_S3 ,
465+ help = "Upload output files (redacted PDFs, anonymized documents, etc.) to S3 after processing." ,
466+ )
467+ general_group .add_argument (
468+ "--s3_outputs_folder" ,
469+ default = S3_OUTPUTS_FOLDER ,
470+ help = "S3 folder (key prefix) for saving output files. If left blank, outputs will not be uploaded even if --save_outputs_to_s3 is enabled." ,
471+ )
472+ general_group .add_argument (
473+ "--s3_outputs_bucket" ,
474+ default = S3_OUTPUTS_BUCKET ,
475+ help = "S3 bucket name for output files (defaults to --s3_bucket if not specified)." ,
476+ )
389477 general_group .add_argument (
390478 "--do_initial_clean" ,
391479 default = DO_INITIAL_TABULAR_DATA_CLEAN ,
@@ -754,6 +842,8 @@ def main(direct_mode_args={}):
754842 args .match_fuzzy_whole_phrase_bool = False
755843 # Convert save_to_user_folders to boolean (handles both string and boolean values)
756844 args .save_to_user_folders = convert_string_to_boolean (args .save_to_user_folders )
845+ # Convert save_outputs_to_s3 to boolean (handles both string and boolean values)
846+ args .save_outputs_to_s3 = convert_string_to_boolean (args .save_outputs_to_s3 )
757847
758848 # Combine extraction options
759849 extraction_options = (
@@ -814,6 +904,21 @@ def main(direct_mode_args={}):
814904 f"Conducting analyses with user { args .username } . Outputs will be saved to { args .output_dir } ."
815905 )
816906
907+ # Build S3 output folder path if S3 uploads are enabled
908+ s3_output_folder = ""
909+ if args .save_outputs_to_s3 and args .s3_outputs_folder :
910+ s3_output_folder = _build_s3_output_folder (
911+ s3_outputs_folder = args .s3_outputs_folder ,
912+ session_hash = session_hash ,
913+ save_to_user_folders = args .save_to_user_folders ,
914+ )
915+ if s3_output_folder :
916+ print (f"S3 output folder: s3://{ args .s3_outputs_bucket } /{ s3_output_folder } " )
917+ elif args .save_outputs_to_s3 and not args .s3_outputs_folder :
918+ print (
919+ "Warning: --save_outputs_to_s3 is enabled but --s3_outputs_folder is not set. Outputs will not be uploaded to S3."
920+ )
921+
817922 # --- Route to the Correct Workflow Based on Task and File Type ---
818923
819924 # Validate input_file requirement for tasks that need it
@@ -1006,6 +1111,28 @@ def main(direct_mode_args={}):
10061111 if log_files :
10071112 print ("Log Files:" , sorted (log_files ))
10081113
1114+ # Upload output files to S3 if enabled
1115+ if args .save_outputs_to_s3 and s3_output_folder and output_files :
1116+ print ("\n --- Uploading output files to S3 ---" )
1117+ try :
1118+ # Get base file name for organizing outputs
1119+ (
1120+ os .path .splitext (os .path .basename (args .input_file [0 ]))[0 ]
1121+ if args .input_file
1122+ else None
1123+ )
1124+ export_outputs_to_s3 (
1125+ file_list_state = output_files ,
1126+ s3_output_folder_state_value = s3_output_folder ,
1127+ save_outputs_to_s3_flag = args .save_outputs_to_s3 ,
1128+ base_file_state = (
1129+ args .input_file [0 ] if args .input_file else None
1130+ ),
1131+ s3_bucket = args .s3_outputs_bucket ,
1132+ )
1133+ except Exception as e :
1134+ print (f"Warning: Could not upload output files to S3: { e } " )
1135+
10091136 except Exception as e :
10101137 print (
10111138 f"\n An error occurred during the PDF/Image redaction workflow: { e } "
@@ -1115,6 +1242,22 @@ def main(direct_mode_args={}):
11151242 if log_files :
11161243 print ("Log Files:" , sorted (log_files ))
11171244
1245+ # Upload output files to S3 if enabled
1246+ if args .save_outputs_to_s3 and s3_output_folder and output_files :
1247+ print ("\n --- Uploading output files to S3 ---" )
1248+ try :
1249+ export_outputs_to_s3 (
1250+ file_list_state = output_files ,
1251+ s3_output_folder_state_value = s3_output_folder ,
1252+ save_outputs_to_s3_flag = args .save_outputs_to_s3 ,
1253+ base_file_state = (
1254+ args .input_file [0 ] if args .input_file else None
1255+ ),
1256+ s3_bucket = args .s3_outputs_bucket ,
1257+ )
1258+ except Exception as e :
1259+ print (f"Warning: Could not upload output files to S3: { e } " )
1260+
11181261 except Exception as e :
11191262 print (
11201263 f"\n An error occurred during the Word/Tabular anonymisation workflow: { e } "
@@ -1173,6 +1316,22 @@ def main(direct_mode_args={}):
11731316 if output_paths :
11741317 print ("Generated Files:" , sorted (output_paths ))
11751318
1319+ # Upload output files to S3 if enabled
1320+ if args .save_outputs_to_s3 and s3_output_folder and output_paths :
1321+ print ("\n --- Uploading output files to S3 ---" )
1322+ try :
1323+ export_outputs_to_s3 (
1324+ file_list_state = output_paths ,
1325+ s3_output_folder_state_value = s3_output_folder ,
1326+ save_outputs_to_s3_flag = args .save_outputs_to_s3 ,
1327+ base_file_state = (
1328+ args .input_file [0 ] if args .input_file else None
1329+ ),
1330+ s3_bucket = args .s3_outputs_bucket ,
1331+ )
1332+ except Exception as e :
1333+ print (f"Warning: Could not upload output files to S3: { e } " )
1334+
11761335 else :
11771336 print (
11781337 "Error: Page duplicate detection requires CSV files with OCR data."
@@ -1314,6 +1473,22 @@ def main(direct_mode_args={}):
13141473 if output_paths :
13151474 print ("Generated Files:" , sorted (output_paths ))
13161475
1476+ # Upload output files to S3 if enabled
1477+ if args .save_outputs_to_s3 and s3_output_folder and output_paths :
1478+ print ("\n --- Uploading output files to S3 ---" )
1479+ try :
1480+ export_outputs_to_s3 (
1481+ file_list_state = output_paths ,
1482+ s3_output_folder_state_value = s3_output_folder ,
1483+ save_outputs_to_s3_flag = args .save_outputs_to_s3 ,
1484+ base_file_state = (
1485+ args .input_file [0 ] if args .input_file else None
1486+ ),
1487+ s3_bucket = args .s3_outputs_bucket ,
1488+ )
1489+ except Exception as e :
1490+ print (f"Warning: Could not upload output files to S3: { e } " )
1491+
13171492 else :
13181493 print (
13191494 "Error: Tabular duplicate detection requires CSV, Excel, or Parquet files."
0 commit comments