Skip to content

Commit 58bd5e8

Browse files
Merge pull request #127 from seanpedrick-case/dev
Package updates. Minor documentation updates. Fixes for review PDF load efficiency, OCR efficiency, and a minor deduplication bug.
2 parents 13caf87 + ab39d74 commit 58bd5e8

15 files changed

+1298
-727
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,5 @@ test/usage/*
4040
model_cache/*
4141
sanitized_file/*
4242
src/doc_redaction.egg-info/*
43+
44+
**/*.quarto_ipynb

README.md

Lines changed: 259 additions & 117 deletions
Large diffs are not rendered by default.

app.py

Lines changed: 113 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,11 @@
201201
)
202202
from tools.custom_csvlogger import CSVLogger_custom
203203
from tools.data_anonymise import anonymise_files_with_open_text
204-
from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
204+
from tools.file_conversion import (
205+
get_document_file_names,
206+
get_input_file_names,
207+
prepare_image_or_pdf,
208+
)
205209
from tools.file_redaction import choose_and_run_redactor
206210
from tools.find_duplicate_pages import (
207211
apply_whole_page_redactions_from_list,
@@ -3973,15 +3977,19 @@ def show_tabular_info_box_on_click(
39733977
with gr.Accordion("Log file outputs", open=False):
39743978
log_files_output = gr.File(label="Log file output", interactive=False)
39753979

3976-
with gr.Accordion("S3 output settings", open=False):
3980+
with gr.Accordion(
3981+
"S3 output settings", open=False, visible=SAVE_OUTPUTS_TO_S3
3982+
):
39773983
save_outputs_to_s3_checkbox = gr.Checkbox(
3978-
label="Save redaction outputs to S3 (requires RUN_AWS_FUNCTIONS=True and S3_OUTPUTS_FOLDER set)",
3984+
label="Save redaction outputs to S3",
39793985
value=SAVE_OUTPUTS_TO_S3,
3986+
visible=SAVE_OUTPUTS_TO_S3,
39803987
)
39813988
s3_output_folder_display = gr.Textbox(
3982-
label="Resolved S3 outputs folder",
3989+
label="S3 outputs folder",
39833990
value="",
39843991
interactive=False,
3992+
visible=SAVE_OUTPUTS_TO_S3,
39853993
)
39863994

39873995
with gr.Accordion("Combine multiple review files", open=False):
@@ -4358,8 +4366,23 @@ def handle_main_redaction_method_selection(redaction_method):
43584366
}
43594367
}"""
43604368

4369+
def check_duplicate_pages_checkbox(redact_duplicate_pages_checkbox_value: bool):
4370+
if not redact_duplicate_pages_checkbox_value:
4371+
# Silently raise an error to avoid showing a popup
4372+
return
4373+
if redact_duplicate_pages_checkbox_value:
4374+
print("Redact duplicate pages checkbox is enabled, identifying duplicates")
4375+
sys.tracebacklimit = 0 # Suppress traceback
4376+
raise ProcessStop(
4377+
"Redact duplicate pages checkbox is enabled, identifying duplicates."
4378+
)
4379+
4380+
def restore_sys_tracebacklimit():
4381+
sys.tracebacklimit = 1000 # Restore traceback limit
4382+
return
4383+
43614384
in_doc_files.upload(
4362-
fn=get_input_file_names,
4385+
fn=get_document_file_names,
43634386
inputs=[in_doc_files],
43644387
outputs=[
43654388
doc_file_name_no_extension_textbox,
@@ -4426,6 +4449,75 @@ def handle_main_redaction_method_selection(redaction_method):
44264449
outputs=[relevant_ocr_output_with_words_found_checkbox],
44274450
)
44284451

4452+
# Same process as above for walkthrough file input
4453+
walkthrough_file_input.upload(
4454+
fn=get_document_file_names,
4455+
inputs=[walkthrough_file_input],
4456+
outputs=[
4457+
doc_file_name_no_extension_textbox,
4458+
doc_file_name_with_extension_textbox,
4459+
doc_full_file_name_textbox,
4460+
doc_file_name_textbox_list,
4461+
total_pdf_page_count,
4462+
],
4463+
).success(
4464+
fn=prepare_image_or_pdf,
4465+
inputs=[
4466+
walkthrough_file_input,
4467+
text_extract_method_radio,
4468+
all_page_line_level_ocr_results_df_base,
4469+
all_page_line_level_ocr_results_with_words_df_base,
4470+
latest_file_completed_num,
4471+
redaction_output_summary_textbox,
4472+
first_loop_state,
4473+
annotate_max_pages,
4474+
all_image_annotations_state,
4475+
prepare_for_review_bool_false,
4476+
in_fully_redacted_list_state,
4477+
output_folder_textbox,
4478+
input_folder_textbox,
4479+
prepare_images_bool_false,
4480+
page_sizes,
4481+
pdf_doc_state,
4482+
page_min,
4483+
page_max,
4484+
],
4485+
outputs=[
4486+
redaction_output_summary_textbox,
4487+
prepared_pdf_state,
4488+
images_pdf_state,
4489+
annotate_max_pages,
4490+
annotate_max_pages_bottom,
4491+
pdf_doc_state,
4492+
all_image_annotations_state,
4493+
review_file_df,
4494+
document_cropboxes,
4495+
page_sizes,
4496+
textract_output_found_checkbox,
4497+
all_img_details_state,
4498+
all_page_line_level_ocr_results_df_base,
4499+
relevant_ocr_output_with_words_found_checkbox,
4500+
all_page_line_level_ocr_results_with_words_df_base,
4501+
],
4502+
show_progress_on=[redaction_output_summary_textbox],
4503+
).success(
4504+
fn=check_for_existing_textract_file,
4505+
inputs=[
4506+
doc_file_name_no_extension_textbox,
4507+
output_folder_textbox,
4508+
handwrite_signature_checkbox,
4509+
],
4510+
outputs=[textract_output_found_checkbox],
4511+
).success(
4512+
fn=check_for_relevant_ocr_output_with_words,
4513+
inputs=[
4514+
doc_file_name_no_extension_textbox,
4515+
text_extract_method_radio,
4516+
output_folder_textbox,
4517+
],
4518+
outputs=[relevant_ocr_output_with_words_found_checkbox],
4519+
)
4520+
44294521
# Run redaction function
44304522
document_redact_btn.click(
44314523
fn=reset_state_vars,
@@ -4555,6 +4647,7 @@ def handle_main_redaction_method_selection(redaction_method):
45554647
llm_model_name_textbox,
45564648
llm_total_input_tokens_number,
45574649
llm_total_output_tokens_number,
4650+
total_pdf_page_count,
45584651
],
45594652
api_name="redact_doc",
45604653
show_progress_on=[redaction_output_summary_textbox],
@@ -4599,21 +4692,16 @@ def handle_main_redaction_method_selection(redaction_method):
45994692
],
46004693
show_progress_on=[annotator],
46014694
)
4602-
4603-
def check_duplicate_pages_checkbox(redact_duplicate_pages_checkbox_value: bool):
4604-
if not redact_duplicate_pages_checkbox_value:
4605-
# Silently raise an error to avoid showing a popup
4606-
return
4607-
if redact_duplicate_pages_checkbox_value:
4608-
print("Redact duplicate pages checkbox is enabled, identifying duplicates")
4609-
sys.tracebacklimit = 0 # Suppress traceback
4610-
raise ProcessStop(
4611-
"Redact duplicate pages checkbox is enabled, identifying duplicates."
4612-
)
4613-
4614-
def restore_sys_tracebacklimit():
4615-
sys.tracebacklimit = 1000 # Restore traceback limit
4616-
return
4695+
# ).success(
4696+
# fn=check_duplicate_pages_checkbox,
4697+
# inputs=[redact_duplicate_pages_checkbox],
4698+
# outputs=None,
4699+
# ).failure(
4700+
# fn=lambda: None, js=TRIGGER_DUPLICATE_DETECTION_BUTTON
4701+
# ).then(
4702+
# fn=restore_sys_tracebacklimit,
4703+
# outputs=None,
4704+
# )
46174705

46184706
# If a file has been completed, the function will continue onto the next document
46194707
latest_file_completed_num.change(
@@ -4715,6 +4803,7 @@ def restore_sys_tracebacklimit():
47154803
llm_model_name_textbox,
47164804
llm_total_input_tokens_number,
47174805
llm_total_output_tokens_number,
4806+
total_pdf_page_count,
47184807
],
47194808
show_progress_on=[redaction_output_summary_textbox],
47204809
).success(
@@ -5098,6 +5187,7 @@ def restore_sys_tracebacklimit():
50985187
llm_model_name_textbox,
50995188
llm_total_input_tokens_number,
51005189
llm_total_output_tokens_number,
5190+
total_pdf_page_count,
51015191
],
51025192
show_progress_on=[redaction_output_summary_textbox],
51035193
).success(
@@ -5159,7 +5249,7 @@ def restore_sys_tracebacklimit():
51595249
inputs=None,
51605250
outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base],
51615251
).success(
5162-
fn=get_input_file_names,
5252+
fn=get_document_file_names,
51635253
inputs=[input_pdf_for_review],
51645254
outputs=[
51655255
doc_file_name_no_extension_textbox,
@@ -7097,7 +7187,7 @@ def run_search_with_regex_option(
70977187

70987188
# Convert review file to xfdf Adobe format
70997189
convert_review_file_to_adobe_btn.click(
7100-
fn=get_input_file_names,
7190+
fn=get_document_file_names,
71017191
inputs=[input_pdf_for_review],
71027192
outputs=[
71037193
doc_file_name_no_extension_textbox,
@@ -7170,7 +7260,7 @@ def run_search_with_regex_option(
71707260

71717261
# Convert xfdf Adobe file back to review_file.csv
71727262
convert_adobe_to_review_file_btn.click(
7173-
fn=get_input_file_names,
7263+
fn=get_document_file_names,
71747264
inputs=[adobe_review_files_out],
71757265
outputs=[
71767266
doc_file_name_no_extension_textbox,

cli_redact.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1228,6 +1228,7 @@ def main(direct_mode_args={}):
12281228
llm_model_name,
12291229
llm_total_input_tokens,
12301230
llm_total_output_tokens,
1231+
_,
12311232
) = choose_and_run_redactor(
12321233
file_paths=args.input_file,
12331234
prepared_pdf_file_paths=prepared_pdf_paths,

intros/long_intro.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Document redaction
22

3-
Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features in the app.
3+
Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features in the app.
44

55
To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
66

intros/short_intro.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Document redaction
22

3-
Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features and settings.
3+
Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features and settings.
44

55
To start, upload a document below (or click on an example), then click 'Extract text and redact document' to redact the document. Then, view and modify suggested redactions on the 'Review redactions' tab.
66

pyproject.toml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "doc_redaction"
7-
version = "1.7.0"
7+
version = "1.7.1"
88
description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
99
readme = "README.md"
1010
authors = [
@@ -38,21 +38,21 @@ classifiers = [
3838
]
3939
requires-python = ">=3.10"
4040
dependencies = [
41-
"pdfminer.six==20251107",
41+
"pdfminer.six==20260107",
4242
"pdf2image==1.17.0",
43-
"pymupdf==1.26.6",
43+
"pymupdf==1.26.7",
4444
"bleach==6.3.0",
45-
"opencv-python==4.12.0.88",
45+
"opencv-python==4.13.0.90",
4646
"presidio_analyzer==2.2.360",
4747
"presidio_anonymizer==2.2.360",
4848
"presidio-image-redactor==0.0.57",
49-
"pikepdf==9.11.0",
49+
"pikepdf==10.3.0",
5050
"pandas==2.3.3",
51-
"scikit-learn==1.7.2",
52-
"spacy==3.8.8",
51+
"scikit-learn<=1.8.0",
52+
"spacy==3.8.11",
5353
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
5454
"gradio==5.49.1",
55-
"boto3==1.40.72",
55+
"boto3==1.42.40",
5656
"pyarrow==21.0.0",
5757
"openpyxl==3.1.5",
5858
"Faker==37.8.0",
@@ -66,7 +66,7 @@ dependencies = [
6666
"polars==1.35.2",
6767
"defusedxml==0.7.1",
6868
"numpy==2.2.6",
69-
"spaces==0.42.1",
69+
"spaces==0.47.0",
7070
"google-genai>=1.52.0",
7171
"openai>=2.8.1",
7272
"markdown>=3.7"
@@ -93,10 +93,10 @@ paddle = [
9393
vlm = [
9494
"torch>=2.5.1,<=2.8.0",
9595
"torchvision>=0.20.1",
96-
"transformers==4.57.2",
97-
"accelerate==1.11.0",
98-
"bitsandbytes==0.48.2",
99-
"sentencepiece==0.2.1", # Needed for PaddleOCRVL
96+
"transformers==4.57.6",
97+
"accelerate>=1.11.0",
98+
"bitsandbytes>=0.48.2",
99+
"sentencepiece>=0.2.1", # Needed for PaddleOCRVL
100100
]
101101

102102
# Run Gradio as an mcp server

requirements.txt

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ bleach==6.3.0
55
polars==1.35.2
66
pyarrow==21.0.0
77
openpyxl==3.1.5
8-
boto3==1.40.72
8+
boto3==1.42.40
99
python-dotenv==1.0.1
1010
defusedxml==0.7.1
1111
Faker==37.8.0
@@ -15,11 +15,11 @@ markdown>=3.7
1515
tabulate>=0.9.0
1616

1717
# --- PDF / OCR / Redaction tools ---
18-
pdfminer.six==20251107
18+
pdfminer.six==20260107
1919
pdf2image==1.17.0
20-
pymupdf==1.26.6
21-
pikepdf==9.11.0
22-
opencv-python==4.12.0.88
20+
pymupdf==1.26.7
21+
pikepdf==10.3.0
22+
opencv-python==4.13.0.90
2323
presidio_analyzer==2.2.360
2424
presidio_anonymizer==2.2.360
2525
presidio-image-redactor==0.0.57
@@ -30,17 +30,17 @@ python-docx==1.2.0
3030
# --- Gradio and apps ---
3131
gradio==5.49.1
3232
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # Custom annotator version with rotation, zoom, labels, and box IDs
33-
spaces==0.42.1
33+
spaces==0.47.0
3434

3535
# --- AWS Lambda runtime ---
3636
awslambdaric==3.1.1
3737

3838
# --- Machine learning / NLP ---
39-
scikit-learn==1.7.2
40-
spacy==3.8.8
39+
scikit-learn<=1.8.0
40+
spacy==3.8.11
4141
spaczz==0.6.1
4242
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
43-
transformers==4.57.2
43+
transformers==4.57.6
4444
accelerate==1.11.0
4545
bitsandbytes==0.48.2
4646
sentencepiece==0.2.1

0 commit comments

Comments
 (0)