seanpedrick-case
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 259 additions & 117 deletions b/‎README.md‎
Lines changed: 259 additions & 117 deletions
diff --git a/‎app.py‎
Lines changed: 113 additions & 23 deletions b/‎app.py‎
Lines changed: 113 additions & 23 deletions
diff --git a/‎cli_redact.py‎
Lines changed: 1 addition & 0 deletions b/‎cli_redact.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎intros/long_intro.txt‎
Lines changed: 1 addition & 1 deletion b/‎intros/long_intro.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎intros/short_intro.txt‎
Lines changed: 1 addition & 1 deletion b/‎intros/short_intro.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 13 additions & 13 deletions b/‎pyproject.toml‎
Lines changed: 13 additions & 13 deletions
diff --git a/‎requirements.txt‎
Lines changed: 9 additions & 9 deletions b/‎requirements.txt‎
Lines changed: 9 additions & 9 deletions
@@ -40,3 +40,5 @@ test/usage/*
 model_cache/*
 sanitized_file/*
 src/doc_redaction.egg-info/*
+
+**/*.quarto_ipynb
@@ -201,7 +201,11 @@
 )
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.data_anonymise import anonymise_files_with_open_text
-from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
+from tools.file_conversion import (
+    get_document_file_names,
+    get_input_file_names,
+    prepare_image_or_pdf,
+)
 from tools.file_redaction import choose_and_run_redactor
 from tools.find_duplicate_pages import (
     apply_whole_page_redactions_from_list,
@@ -3973,15 +3977,19 @@ def show_tabular_info_box_on_click(
             with gr.Accordion("Log file outputs", open=False):
                 log_files_output = gr.File(label="Log file output", interactive=False)
 
-            with gr.Accordion("S3 output settings", open=False):
+            with gr.Accordion(
+                "S3 output settings", open=False, visible=SAVE_OUTPUTS_TO_S3
+            ):
                 save_outputs_to_s3_checkbox = gr.Checkbox(
-                    label="Save redaction outputs to S3 (requires RUN_AWS_FUNCTIONS=True and S3_OUTPUTS_FOLDER set)",
+                    label="Save redaction outputs to S3",
                     value=SAVE_OUTPUTS_TO_S3,
+                    visible=SAVE_OUTPUTS_TO_S3,
                 )
                 s3_output_folder_display = gr.Textbox(
-                    label="Resolved S3 outputs folder",
+                    label="S3 outputs folder",
                     value="",
                     interactive=False,
+                    visible=SAVE_OUTPUTS_TO_S3,
                 )
 
             with gr.Accordion("Combine multiple review files", open=False):
@@ -4358,8 +4366,23 @@ def handle_main_redaction_method_selection(redaction_method):
         }
     }"""
 
+    def check_duplicate_pages_checkbox(redact_duplicate_pages_checkbox_value: bool):  # Raises ProcessStop when the checkbox value is truthy; otherwise returns None
+        if not redact_duplicate_pages_checkbox_value:
+            # Checkbox is not ticked: return normally, nothing is raised on this branch
+            return
+        if redact_duplicate_pages_checkbox_value:
+            print("Redact duplicate pages checkbox is enabled, identifying duplicates")
+            sys.tracebacklimit = 0  # Suppress traceback output for the deliberate ProcessStop below (process-wide setting; restore_sys_tracebacklimit sets it back to 1000)
+            raise ProcessStop(  # NOTE(review): presumably raised so a chained .failure() handler can start duplicate detection; the only such wiring visible here is commented out - confirm
+                "Redact duplicate pages checkbox is enabled, identifying duplicates."
+            )
+
+    def restore_sys_tracebacklimit():  # Undo the sys.tracebacklimit = 0 set by check_duplicate_pages_checkbox
+        sys.tracebacklimit = 1000  # Restore traceback limit (1000 is the documented CPython default)
+        return
+
     in_doc_files.upload(
-        fn=get_input_file_names,
+        fn=get_document_file_names,
         inputs=[in_doc_files],
         outputs=[
             doc_file_name_no_extension_textbox,
@@ -4426,6 +4449,75 @@ def handle_main_redaction_method_selection(redaction_method):
         outputs=[relevant_ocr_output_with_words_found_checkbox],
     )
 
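+    # Same event chain as in_doc_files.upload above, wired to the walkthrough file input: get file names, prepare the document, then check for existing Textract and OCR outputs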
+    walkthrough_file_input.upload(
+        fn=get_document_file_names,
+        inputs=[walkthrough_file_input],
+        outputs=[
+            doc_file_name_no_extension_textbox,
+            doc_file_name_with_extension_textbox,
+            doc_full_file_name_textbox,
+            doc_file_name_textbox_list,
+            total_pdf_page_count,
+        ],
+    ).success(
+        fn=prepare_image_or_pdf,
+        inputs=[
+            walkthrough_file_input,
+            text_extract_method_radio,
+            all_page_line_level_ocr_results_df_base,
+            all_page_line_level_ocr_results_with_words_df_base,
+            latest_file_completed_num,
+            redaction_output_summary_textbox,
+            first_loop_state,
+            annotate_max_pages,
+            all_image_annotations_state,
+            prepare_for_review_bool_false,
+            in_fully_redacted_list_state,
+            output_folder_textbox,
+            input_folder_textbox,
+            prepare_images_bool_false,
+            page_sizes,
+            pdf_doc_state,
+            page_min,
+            page_max,
+        ],
+        outputs=[
+            redaction_output_summary_textbox,
+            prepared_pdf_state,
+            images_pdf_state,
+            annotate_max_pages,
+            annotate_max_pages_bottom,
+            pdf_doc_state,
+            all_image_annotations_state,
+            review_file_df,
+            document_cropboxes,
+            page_sizes,
+            textract_output_found_checkbox,
+            all_img_details_state,
+            all_page_line_level_ocr_results_df_base,
+            relevant_ocr_output_with_words_found_checkbox,
+            all_page_line_level_ocr_results_with_words_df_base,
+        ],
+        show_progress_on=[redaction_output_summary_textbox],
+    ).success(
+        fn=check_for_existing_textract_file,
+        inputs=[
+            doc_file_name_no_extension_textbox,
+            output_folder_textbox,
+            handwrite_signature_checkbox,
+        ],
+        outputs=[textract_output_found_checkbox],
+    ).success(
+        fn=check_for_relevant_ocr_output_with_words,
+        inputs=[
+            doc_file_name_no_extension_textbox,
+            text_extract_method_radio,
+            output_folder_textbox,
+        ],
+        outputs=[relevant_ocr_output_with_words_found_checkbox],
+    )
+
     # Run redaction function
     document_redact_btn.click(
         fn=reset_state_vars,
@@ -4555,6 +4647,7 @@ def handle_main_redaction_method_selection(redaction_method):
             llm_model_name_textbox,
             llm_total_input_tokens_number,
             llm_total_output_tokens_number,
+            total_pdf_page_count,
         ],
         api_name="redact_doc",
         show_progress_on=[redaction_output_summary_textbox],
@@ -4599,21 +4692,16 @@ def handle_main_redaction_method_selection(redaction_method):
         ],
         show_progress_on=[annotator],
     )
-
-    def check_duplicate_pages_checkbox(redact_duplicate_pages_checkbox_value: bool):
-        if not redact_duplicate_pages_checkbox_value:
-            # Silently raise an error to avoid showing a popup
-            return
-        if redact_duplicate_pages_checkbox_value:
-            print("Redact duplicate pages checkbox is enabled, identifying duplicates")
-            sys.tracebacklimit = 0  # Suppress traceback
-            raise ProcessStop(
-                "Redact duplicate pages checkbox is enabled, identifying duplicates."
-            )
-
-    def restore_sys_tracebacklimit():
-        sys.tracebacklimit = 1000  # Restore traceback limit
-        return
+    # ).success(
+    #     fn=check_duplicate_pages_checkbox,
+    #     inputs=[redact_duplicate_pages_checkbox],
+    #     outputs=None,
+    # ).failure(
+    #     fn=lambda: None, js=TRIGGER_DUPLICATE_DETECTION_BUTTON
+    # ).then(
+    #     fn=restore_sys_tracebacklimit,
+    #     outputs=None,
+    # )
 
     # If a file has been completed, the function will continue onto the next document
     latest_file_completed_num.change(
@@ -4715,6 +4803,7 @@ def restore_sys_tracebacklimit():
             llm_model_name_textbox,
             llm_total_input_tokens_number,
             llm_total_output_tokens_number,
+            total_pdf_page_count,
         ],
         show_progress_on=[redaction_output_summary_textbox],
     ).success(
@@ -5098,6 +5187,7 @@ def restore_sys_tracebacklimit():
             llm_model_name_textbox,
             llm_total_input_tokens_number,
             llm_total_output_tokens_number,
+            total_pdf_page_count,
         ],
         show_progress_on=[redaction_output_summary_textbox],
     ).success(
@@ -5159,7 +5249,7 @@ def restore_sys_tracebacklimit():
         inputs=None,
         outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base],
     ).success(
-        fn=get_input_file_names,
+        fn=get_document_file_names,
         inputs=[input_pdf_for_review],
         outputs=[
             doc_file_name_no_extension_textbox,
@@ -7097,7 +7187,7 @@ def run_search_with_regex_option(
 
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(
-        fn=get_input_file_names,
+        fn=get_document_file_names,
         inputs=[input_pdf_for_review],
         outputs=[
             doc_file_name_no_extension_textbox,
@@ -7170,7 +7260,7 @@ def run_search_with_regex_option(
 
     # Convert xfdf Adobe file back to review_file.csv
     convert_adobe_to_review_file_btn.click(
-        fn=get_input_file_names,
+        fn=get_document_file_names,
         inputs=[adobe_review_files_out],
         outputs=[
             doc_file_name_no_extension_textbox,
 
@@ -1228,6 +1228,7 @@ def main(direct_mode_args={}):
                     llm_model_name,
                     llm_total_input_tokens,
                     llm_total_output_tokens,
+                    _,
                 ) = choose_and_run_redactor(
                     file_paths=args.input_file,
                     prepared_pdf_file_paths=prepared_pdf_paths,
 
@@ -1,6 +1,6 @@
 # Document redaction
 
-Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features in the app.
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features in the app.
 
 To extract text from documents, the 'Local' options are PikePDF for PDFs with selectable text, and OCR with Tesseract. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings).  AWS Comprehend gives better results at a small cost.
 
 
@@ -1,6 +1,6 @@
 # Document redaction
 
-Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features and settings.
+Redact personally identifiable information (PII) from documents (PDF, PNG, JPG), Word files (DOCX), or tabular data (XLSX/CSV/Parquet). Please see the [User Guide]({USER_GUIDE_URL}) for a full walkthrough of all the features and settings.
 
 To start, upload a document below (or click on an example), then click 'Extract text and redact document' to redact the document. Then, view and modify suggested redactions on the 'Review redactions' tab.
 
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "doc_redaction"
-version = "1.7.0"
+version = "1.7.1"
 description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 authors = [
@@ -38,21 +38,21 @@ classifiers = [
 ]
 requires-python = ">=3.10"
 dependencies = [
-    "pdfminer.six==20251107",
+    "pdfminer.six==20260107",
     "pdf2image==1.17.0",
-    "pymupdf==1.26.6",
+    "pymupdf==1.26.7",
     "bleach==6.3.0",
-    "opencv-python==4.12.0.88",
+    "opencv-python==4.13.0.90",
     "presidio_analyzer==2.2.360",
     "presidio_anonymizer==2.2.360",
     "presidio-image-redactor==0.0.57",
-    "pikepdf==9.11.0",
+    "pikepdf==10.3.0",
     "pandas==2.3.3",
-    "scikit-learn==1.7.2",
-    "spacy==3.8.8",
+    "scikit-learn<=1.8.0",
+    "spacy==3.8.11",
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
     "gradio==5.49.1",
-    "boto3==1.40.72",
+    "boto3==1.42.40",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",
     "Faker==37.8.0",
@@ -66,7 +66,7 @@ dependencies = [
     "polars==1.35.2",
     "defusedxml==0.7.1",
     "numpy==2.2.6",
-    "spaces==0.42.1",
+    "spaces==0.47.0",
     "google-genai>=1.52.0",
     "openai>=2.8.1",
     "markdown>=3.7"
@@ -93,10 +93,10 @@ paddle = [
 vlm = [
     "torch>=2.5.1,<=2.8.0", 
     "torchvision>=0.20.1",
-    "transformers==4.57.2",
-    "accelerate==1.11.0",
-    "bitsandbytes==0.48.2",
-    "sentencepiece==0.2.1", # Needed for PaddleOCRVL
+    "transformers==4.57.6",
+    "accelerate>=1.11.0",
+    "bitsandbytes>=0.48.2",
+    "sentencepiece>=0.2.1", # Needed for PaddleOCRVL
 ]
 
 # Run Gradio as an mcp server
 
@@ -5,7 +5,7 @@ bleach==6.3.0
 polars==1.35.2
 pyarrow==21.0.0
 openpyxl==3.1.5
-boto3==1.40.72
+boto3==1.42.40
 python-dotenv==1.0.1
 defusedxml==0.7.1
 Faker==37.8.0
@@ -15,11 +15,11 @@ markdown>=3.7
 tabulate>=0.9.0
 
 # --- PDF / OCR / Redaction tools ---
-pdfminer.six==20251107
+pdfminer.six==20260107
 pdf2image==1.17.0
-pymupdf==1.26.6
-pikepdf==9.11.0
-opencv-python==4.12.0.88
+pymupdf==1.26.7
+pikepdf==10.3.0
+opencv-python==4.13.0.90
 presidio_analyzer==2.2.360
 presidio_anonymizer==2.2.360
 presidio-image-redactor==0.0.57
@@ -30,17 +30,17 @@ python-docx==1.2.0
 # --- Gradio and apps ---
 gradio==5.49.1
 https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl  # Custom annotator version with rotation, zoom, labels, and box IDs
-spaces==0.42.1
+spaces==0.47.0
 
 # --- AWS Lambda runtime ---
 awslambdaric==3.1.1
 
 # --- Machine learning / NLP ---
-scikit-learn==1.7.2
-spacy==3.8.8
+scikit-learn<=1.8.0
+spacy==3.8.11
 spaczz==0.6.1
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-transformers==4.57.2
+transformers==4.57.6
 accelerate==1.11.0
 bitsandbytes==0.48.2
 sentencepiece==0.2.1