Skip to content

Commit ba5f5dd

Browse files
Merge pull request #114 from seanpedrick-case/dev
Added save to s3 option to cli_redact
2 parents ffec20b + 4663c98 commit ba5f5dd

File tree

3 files changed

+184
-9
lines changed

3 files changed

+184
-9
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ short_description: OCR / redact PDF documents and tabular data
1111
---
1212
# Document redaction
1313

14-
version: 1.6.4
14+
version: 1.6.5
1515

1616
Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a full walkthrough of all the features in the app.
1717

cli_redact.py

Lines changed: 182 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import argparse
22
import os
3+
import re
34
import time
45
import uuid
6+
from datetime import datetime
57

68
import pandas as pd
79

8-
from tools.aws_functions import download_file_from_s3
10+
from tools.aws_functions import download_file_from_s3, export_outputs_to_s3
911
from tools.config import (
1012
ACCESS_LOGS_FOLDER,
1113
ALLOW_LIST_PATH,
@@ -46,9 +48,12 @@
4648
REMOVE_DUPLICATE_ROWS,
4749
RETURN_REDACTED_PDF,
4850
RUN_AWS_FUNCTIONS,
51+
S3_OUTPUTS_BUCKET,
52+
S3_OUTPUTS_FOLDER,
4953
S3_USAGE_LOGS_FOLDER,
5054
SAVE_LOGS_TO_CSV,
5155
SAVE_LOGS_TO_DYNAMODB,
56+
SAVE_OUTPUTS_TO_S3,
5257
SESSION_OUTPUT_FOLDER,
5358
SPACY_MODEL_PATH,
5459
TEXTRACT_JOBS_LOCAL_LOC,
@@ -68,6 +73,34 @@ def _generate_session_hash() -> str:
6873
return str(uuid.uuid4())[:8]
6974

7075

76+
def _sanitize_folder_name(folder_name: str, max_length: int = 50) -> str:
77+
"""
78+
Sanitize folder name for S3 compatibility.
79+
80+
Replaces 'strange' characters (anything that's not alphanumeric, dash, underscore, or full stop)
81+
with underscores, and limits the length to max_length characters.
82+
83+
Args:
84+
folder_name: Original folder name to sanitize
85+
max_length: Maximum length for the folder name (default: 50)
86+
87+
Returns:
88+
Sanitized folder name
89+
"""
90+
if not folder_name:
91+
return folder_name
92+
93+
# Replace any character that's not alphanumeric, dash, underscore, or full stop with underscore
94+
# This handles @, commas, exclamation marks, spaces, etc.
95+
sanitized = re.sub(r"[^a-zA-Z0-9._-]", "_", folder_name)
96+
97+
# Limit length to max_length
98+
if len(sanitized) > max_length:
99+
sanitized = sanitized[:max_length]
100+
101+
return sanitized
102+
103+
71104
def get_username_and_folders(
72105
username: str = "",
73106
output_folder_textbox: str = OUTPUT_FOLDER,
@@ -85,22 +118,25 @@ def get_username_and_folders(
85118
else:
86119
out_session_hash = _generate_session_hash()
87120

121+
# Sanitize session hash for S3 compatibility (especially important for S3 folder paths)
122+
sanitized_session_hash = _sanitize_folder_name(out_session_hash)
123+
88124
if session_output_folder:
89-
output_folder = output_folder_textbox + out_session_hash + "/"
90-
input_folder = input_folder_textbox + out_session_hash + "/"
125+
output_folder = output_folder_textbox + sanitized_session_hash + "/"
126+
input_folder = input_folder_textbox + sanitized_session_hash + "/"
91127

92128
textract_document_upload_input_folder = (
93-
textract_document_upload_input_folder + "/" + out_session_hash
129+
textract_document_upload_input_folder + "/" + sanitized_session_hash
94130
)
95131
textract_document_upload_output_folder = (
96-
textract_document_upload_output_folder + "/" + out_session_hash
132+
textract_document_upload_output_folder + "/" + sanitized_session_hash
97133
)
98134

99135
s3_textract_document_logs_subfolder = (
100-
s3_textract_document_logs_subfolder + "/" + out_session_hash
136+
s3_textract_document_logs_subfolder + "/" + sanitized_session_hash
101137
)
102138
local_textract_document_logs_subfolder = (
103-
local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
139+
local_textract_document_logs_subfolder + "/" + sanitized_session_hash + "/"
104140
)
105141

106142
else:
@@ -196,6 +232,43 @@ def _download_s3_file_if_needed(
196232
raise Exception(f"Failed to download file from S3: {e}")
197233

198234

235+
def _build_s3_output_folder(
    s3_outputs_folder: str,
    session_hash: str,
    save_to_user_folders: bool,
) -> str:
    """
    Assemble the S3 key prefix under which output files are stored.

    The prefix is the base folder, optionally followed by the sanitized
    session hash, and always followed by today's date (YYYYMMDD). The result
    ends with a single trailing slash.

    Args:
        s3_outputs_folder: Base S3 folder path
        session_hash: Session hash/username
        save_to_user_folders: Whether to append session hash to folder path

    Returns:
        The final S3 folder path, or an empty string when no base folder
        was supplied.
    """
    if not s3_outputs_folder:
        return ""

    # Collect the path segments first; they are joined with "/" at the end so
    # the base folder's own trailing slashes never produce a doubled separator.
    segments = [s3_outputs_folder.rstrip("/")]

    # Per-user subfolder only when requested and a session hash is available.
    if save_to_user_folders and session_hash:
        segments.append(_sanitize_folder_name(session_hash))

    # Date subfolder (local time, YYYYMMDD) is appended in every case.
    segments.append(datetime.now().strftime("%Y%m%d"))

    return "/".join(segments) + "/"
270+
271+
199272
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
200273
CHOSEN_COMPREHEND_ENTITIES.extend(CUSTOM_ENTITIES)
201274
FULL_COMPREHEND_ENTITY_LIST.extend(CUSTOM_ENTITIES)
@@ -386,6 +459,21 @@ def main(direct_mode_args={}):
386459
default=DOCUMENT_REDACTION_BUCKET,
387460
help="S3 bucket name for cloud operations.",
388461
)
462+
general_group.add_argument(
463+
"--save_outputs_to_s3",
464+
default=SAVE_OUTPUTS_TO_S3,
465+
help="Upload output files (redacted PDFs, anonymized documents, etc.) to S3 after processing.",
466+
)
467+
general_group.add_argument(
468+
"--s3_outputs_folder",
469+
default=S3_OUTPUTS_FOLDER,
470+
help="S3 folder (key prefix) for saving output files. If left blank, outputs will not be uploaded even if --save_outputs_to_s3 is enabled.",
471+
)
472+
general_group.add_argument(
473+
"--s3_outputs_bucket",
474+
default=S3_OUTPUTS_BUCKET,
475+
help="S3 bucket name for output files (defaults to --s3_bucket if not specified).",
476+
)
389477
general_group.add_argument(
390478
"--do_initial_clean",
391479
default=DO_INITIAL_TABULAR_DATA_CLEAN,
@@ -754,6 +842,8 @@ def main(direct_mode_args={}):
754842
args.match_fuzzy_whole_phrase_bool = False
755843
# Convert save_to_user_folders to boolean (handles both string and boolean values)
756844
args.save_to_user_folders = convert_string_to_boolean(args.save_to_user_folders)
845+
# Convert save_outputs_to_s3 to boolean (handles both string and boolean values)
846+
args.save_outputs_to_s3 = convert_string_to_boolean(args.save_outputs_to_s3)
757847

758848
# Combine extraction options
759849
extraction_options = (
@@ -814,6 +904,21 @@ def main(direct_mode_args={}):
814904
f"Conducting analyses with user {args.username}. Outputs will be saved to {args.output_dir}."
815905
)
816906

907+
# Build S3 output folder path if S3 uploads are enabled
908+
s3_output_folder = ""
909+
if args.save_outputs_to_s3 and args.s3_outputs_folder:
910+
s3_output_folder = _build_s3_output_folder(
911+
s3_outputs_folder=args.s3_outputs_folder,
912+
session_hash=session_hash,
913+
save_to_user_folders=args.save_to_user_folders,
914+
)
915+
if s3_output_folder:
916+
print(f"S3 output folder: s3://{args.s3_outputs_bucket}/{s3_output_folder}")
917+
elif args.save_outputs_to_s3 and not args.s3_outputs_folder:
918+
print(
919+
"Warning: --save_outputs_to_s3 is enabled but --s3_outputs_folder is not set. Outputs will not be uploaded to S3."
920+
)
921+
817922
# --- Route to the Correct Workflow Based on Task and File Type ---
818923

819924
# Validate input_file requirement for tasks that need it
@@ -1006,6 +1111,28 @@ def main(direct_mode_args={}):
10061111
if log_files:
10071112
print("Log Files:", sorted(log_files))
10081113

1114+
# Upload output files to S3 if enabled
1115+
if args.save_outputs_to_s3 and s3_output_folder and output_files:
1116+
print("\n--- Uploading output files to S3 ---")
1117+
try:
1118+
# Get base file name for organizing outputs
1119+
(
1120+
os.path.splitext(os.path.basename(args.input_file[0]))[0]
1121+
if args.input_file
1122+
else None
1123+
)
1124+
export_outputs_to_s3(
1125+
file_list_state=output_files,
1126+
s3_output_folder_state_value=s3_output_folder,
1127+
save_outputs_to_s3_flag=args.save_outputs_to_s3,
1128+
base_file_state=(
1129+
args.input_file[0] if args.input_file else None
1130+
),
1131+
s3_bucket=args.s3_outputs_bucket,
1132+
)
1133+
except Exception as e:
1134+
print(f"Warning: Could not upload output files to S3: {e}")
1135+
10091136
except Exception as e:
10101137
print(
10111138
f"\nAn error occurred during the PDF/Image redaction workflow: {e}"
@@ -1115,6 +1242,22 @@ def main(direct_mode_args={}):
11151242
if log_files:
11161243
print("Log Files:", sorted(log_files))
11171244

1245+
# Upload output files to S3 if enabled
1246+
if args.save_outputs_to_s3 and s3_output_folder and output_files:
1247+
print("\n--- Uploading output files to S3 ---")
1248+
try:
1249+
export_outputs_to_s3(
1250+
file_list_state=output_files,
1251+
s3_output_folder_state_value=s3_output_folder,
1252+
save_outputs_to_s3_flag=args.save_outputs_to_s3,
1253+
base_file_state=(
1254+
args.input_file[0] if args.input_file else None
1255+
),
1256+
s3_bucket=args.s3_outputs_bucket,
1257+
)
1258+
except Exception as e:
1259+
print(f"Warning: Could not upload output files to S3: {e}")
1260+
11181261
except Exception as e:
11191262
print(
11201263
f"\nAn error occurred during the Word/Tabular anonymisation workflow: {e}"
@@ -1173,6 +1316,22 @@ def main(direct_mode_args={}):
11731316
if output_paths:
11741317
print("Generated Files:", sorted(output_paths))
11751318

1319+
# Upload output files to S3 if enabled
1320+
if args.save_outputs_to_s3 and s3_output_folder and output_paths:
1321+
print("\n--- Uploading output files to S3 ---")
1322+
try:
1323+
export_outputs_to_s3(
1324+
file_list_state=output_paths,
1325+
s3_output_folder_state_value=s3_output_folder,
1326+
save_outputs_to_s3_flag=args.save_outputs_to_s3,
1327+
base_file_state=(
1328+
args.input_file[0] if args.input_file else None
1329+
),
1330+
s3_bucket=args.s3_outputs_bucket,
1331+
)
1332+
except Exception as e:
1333+
print(f"Warning: Could not upload output files to S3: {e}")
1334+
11761335
else:
11771336
print(
11781337
"Error: Page duplicate detection requires CSV files with OCR data."
@@ -1314,6 +1473,22 @@ def main(direct_mode_args={}):
13141473
if output_paths:
13151474
print("Generated Files:", sorted(output_paths))
13161475

1476+
# Upload output files to S3 if enabled
1477+
if args.save_outputs_to_s3 and s3_output_folder and output_paths:
1478+
print("\n--- Uploading output files to S3 ---")
1479+
try:
1480+
export_outputs_to_s3(
1481+
file_list_state=output_paths,
1482+
s3_output_folder_state_value=s3_output_folder,
1483+
save_outputs_to_s3_flag=args.save_outputs_to_s3,
1484+
base_file_state=(
1485+
args.input_file[0] if args.input_file else None
1486+
),
1487+
s3_bucket=args.s3_outputs_bucket,
1488+
)
1489+
except Exception as e:
1490+
print(f"Warning: Could not upload output files to S3: {e}")
1491+
13171492
else:
13181493
print(
13191494
"Error: Tabular duplicate detection requires CSV, Excel, or Parquet files."

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "doc_redaction"
7-
version = "1.6.4"
7+
version = "1.6.5"
88
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
99
readme = "README.md"
1010
authors = [

0 commit comments

Comments
 (0)