|
201 | 201 | ) |
202 | 202 | from tools.custom_csvlogger import CSVLogger_custom |
203 | 203 | from tools.data_anonymise import anonymise_files_with_open_text |
204 | | -from tools.file_conversion import get_input_file_names, prepare_image_or_pdf |
| 204 | +from tools.file_conversion import ( |
| 205 | + get_document_file_names, |
| 206 | + get_input_file_names, |
| 207 | + prepare_image_or_pdf, |
| 208 | +) |
205 | 209 | from tools.file_redaction import choose_and_run_redactor |
206 | 210 | from tools.find_duplicate_pages import ( |
207 | 211 | apply_whole_page_redactions_from_list, |
@@ -3973,15 +3977,19 @@ def show_tabular_info_box_on_click( |
3973 | 3977 | with gr.Accordion("Log file outputs", open=False): |
3974 | 3978 | log_files_output = gr.File(label="Log file output", interactive=False) |
3975 | 3979 |
|
3976 | | - with gr.Accordion("S3 output settings", open=False): |
| 3980 | + with gr.Accordion( |
| 3981 | + "S3 output settings", open=False, visible=SAVE_OUTPUTS_TO_S3 |
| 3982 | + ): |
3977 | 3983 | save_outputs_to_s3_checkbox = gr.Checkbox( |
3978 | | - label="Save redaction outputs to S3 (requires RUN_AWS_FUNCTIONS=True and S3_OUTPUTS_FOLDER set)", |
| 3984 | + label="Save redaction outputs to S3", |
3979 | 3985 | value=SAVE_OUTPUTS_TO_S3, |
| 3986 | + visible=SAVE_OUTPUTS_TO_S3, |
3980 | 3987 | ) |
3981 | 3988 | s3_output_folder_display = gr.Textbox( |
3982 | | - label="Resolved S3 outputs folder", |
| 3989 | + label="S3 outputs folder", |
3983 | 3990 | value="", |
3984 | 3991 | interactive=False, |
| 3992 | + visible=SAVE_OUTPUTS_TO_S3, |
3985 | 3993 | ) |
3986 | 3994 |
|
3987 | 3995 | with gr.Accordion("Combine multiple review files", open=False): |
@@ -4358,8 +4366,23 @@ def handle_main_redaction_method_selection(redaction_method): |
4358 | 4366 | } |
4359 | 4367 | }""" |
4360 | 4368 |
|
| 4369 | + def check_duplicate_pages_checkbox(redact_duplicate_pages_checkbox_value: bool): |
| 4370 | + if not redact_duplicate_pages_checkbox_value: |
| 4371 | + # Checkbox not ticked: nothing to do, so return normally without raising |
| 4372 | + return |
| 4373 | + if redact_duplicate_pages_checkbox_value: |
| 4374 | + print("Redact duplicate pages checkbox is enabled, identifying duplicates") |
| 4375 | + sys.tracebacklimit = 0 # Process-wide setting: hides the traceback of the ProcessStop raised below; reset by restore_sys_tracebacklimit |
| 4376 | + raise ProcessStop( |
| 4377 | + "Redact duplicate pages checkbox is enabled, identifying duplicates." |
| 4378 | + ) |
| 4379 | + |
| 4380 | + def restore_sys_tracebacklimit(): |
| 4381 | + sys.tracebacklimit = 1000 # Reset to 1000 (Python's documented default), undoing the 0 set by check_duplicate_pages_checkbox |
| 4382 | + return |
| 4383 | + |
4361 | 4384 | in_doc_files.upload( |
4362 | | - fn=get_input_file_names, |
| 4385 | + fn=get_document_file_names, |
4363 | 4386 | inputs=[in_doc_files], |
4364 | 4387 | outputs=[ |
4365 | 4388 | doc_file_name_no_extension_textbox, |
@@ -4426,6 +4449,75 @@ def handle_main_redaction_method_selection(redaction_method): |
4426 | 4449 | outputs=[relevant_ocr_output_with_words_found_checkbox], |
4427 | 4450 | ) |
4428 | 4451 |
|
| 4452 | + # Walkthrough file input (presumably mirrors the in_doc_files.upload chain above): get file names, prepare the document, then check for existing Textract and OCR-with-words outputs |
| 4453 | + walkthrough_file_input.upload( |
| 4454 | + fn=get_document_file_names, |
| 4455 | + inputs=[walkthrough_file_input], |
| 4456 | + outputs=[ |
| 4457 | + doc_file_name_no_extension_textbox, |
| 4458 | + doc_file_name_with_extension_textbox, |
| 4459 | + doc_full_file_name_textbox, |
| 4460 | + doc_file_name_textbox_list, |
| 4461 | + total_pdf_page_count, |
| 4462 | + ], |
| 4463 | + ).success( |
| 4464 | + fn=prepare_image_or_pdf, |
| 4465 | + inputs=[ |
| 4466 | + walkthrough_file_input, |
| 4467 | + text_extract_method_radio, |
| 4468 | + all_page_line_level_ocr_results_df_base, |
| 4469 | + all_page_line_level_ocr_results_with_words_df_base, |
| 4470 | + latest_file_completed_num, |
| 4471 | + redaction_output_summary_textbox, |
| 4472 | + first_loop_state, |
| 4473 | + annotate_max_pages, |
| 4474 | + all_image_annotations_state, |
| 4475 | + prepare_for_review_bool_false, |
| 4476 | + in_fully_redacted_list_state, |
| 4477 | + output_folder_textbox, |
| 4478 | + input_folder_textbox, |
| 4479 | + prepare_images_bool_false, |
| 4480 | + page_sizes, |
| 4481 | + pdf_doc_state, |
| 4482 | + page_min, |
| 4483 | + page_max, |
| 4484 | + ], |
| 4485 | + outputs=[ |
| 4486 | + redaction_output_summary_textbox, |
| 4487 | + prepared_pdf_state, |
| 4488 | + images_pdf_state, |
| 4489 | + annotate_max_pages, |
| 4490 | + annotate_max_pages_bottom, |
| 4491 | + pdf_doc_state, |
| 4492 | + all_image_annotations_state, |
| 4493 | + review_file_df, |
| 4494 | + document_cropboxes, |
| 4495 | + page_sizes, |
| 4496 | + textract_output_found_checkbox, |
| 4497 | + all_img_details_state, |
| 4498 | + all_page_line_level_ocr_results_df_base, |
| 4499 | + relevant_ocr_output_with_words_found_checkbox, |
| 4500 | + all_page_line_level_ocr_results_with_words_df_base, |
| 4501 | + ], |
| 4502 | + show_progress_on=[redaction_output_summary_textbox], |
| 4503 | + ).success( |
| 4504 | + fn=check_for_existing_textract_file, |
| 4505 | + inputs=[ |
| 4506 | + doc_file_name_no_extension_textbox, |
| 4507 | + output_folder_textbox, |
| 4508 | + handwrite_signature_checkbox, |
| 4509 | + ], |
| 4510 | + outputs=[textract_output_found_checkbox], |
| 4511 | + ).success( |
| 4512 | + fn=check_for_relevant_ocr_output_with_words, |
| 4513 | + inputs=[ |
| 4514 | + doc_file_name_no_extension_textbox, |
| 4515 | + text_extract_method_radio, |
| 4516 | + output_folder_textbox, |
| 4517 | + ], |
| 4518 | + outputs=[relevant_ocr_output_with_words_found_checkbox], |
| 4519 | + ) |
| 4520 | + |
4429 | 4521 | # Run redaction function |
4430 | 4522 | document_redact_btn.click( |
4431 | 4523 | fn=reset_state_vars, |
@@ -4555,6 +4647,7 @@ def handle_main_redaction_method_selection(redaction_method): |
4555 | 4647 | llm_model_name_textbox, |
4556 | 4648 | llm_total_input_tokens_number, |
4557 | 4649 | llm_total_output_tokens_number, |
| 4650 | + total_pdf_page_count, |
4558 | 4651 | ], |
4559 | 4652 | api_name="redact_doc", |
4560 | 4653 | show_progress_on=[redaction_output_summary_textbox], |
@@ -4599,21 +4692,16 @@ def handle_main_redaction_method_selection(redaction_method): |
4599 | 4692 | ], |
4600 | 4693 | show_progress_on=[annotator], |
4601 | 4694 | ) |
4602 | | - |
4603 | | - def check_duplicate_pages_checkbox(redact_duplicate_pages_checkbox_value: bool): |
4604 | | - if not redact_duplicate_pages_checkbox_value: |
4605 | | - # Silently raise an error to avoid showing a popup |
4606 | | - return |
4607 | | - if redact_duplicate_pages_checkbox_value: |
4608 | | - print("Redact duplicate pages checkbox is enabled, identifying duplicates") |
4609 | | - sys.tracebacklimit = 0 # Suppress traceback |
4610 | | - raise ProcessStop( |
4611 | | - "Redact duplicate pages checkbox is enabled, identifying duplicates." |
4612 | | - ) |
4613 | | - |
4614 | | - def restore_sys_tracebacklimit(): |
4615 | | - sys.tracebacklimit = 1000 # Restore traceback limit |
4616 | | - return |
| 4695 | + # ).success( |
| 4696 | + # fn=check_duplicate_pages_checkbox, |
| 4697 | + # inputs=[redact_duplicate_pages_checkbox], |
| 4698 | + # outputs=None, |
| 4699 | + # ).failure( |
| 4700 | + # fn=lambda: None, js=TRIGGER_DUPLICATE_DETECTION_BUTTON |
| 4701 | + # ).then( |
| 4702 | + # fn=restore_sys_tracebacklimit, |
| 4703 | + # outputs=None, |
| 4704 | + # ) |
4617 | 4705 |
|
4618 | 4706 | # If a file has been completed, the function will continue onto the next document |
4619 | 4707 | latest_file_completed_num.change( |
@@ -4715,6 +4803,7 @@ def restore_sys_tracebacklimit(): |
4715 | 4803 | llm_model_name_textbox, |
4716 | 4804 | llm_total_input_tokens_number, |
4717 | 4805 | llm_total_output_tokens_number, |
| 4806 | + total_pdf_page_count, |
4718 | 4807 | ], |
4719 | 4808 | show_progress_on=[redaction_output_summary_textbox], |
4720 | 4809 | ).success( |
@@ -5098,6 +5187,7 @@ def restore_sys_tracebacklimit(): |
5098 | 5187 | llm_model_name_textbox, |
5099 | 5188 | llm_total_input_tokens_number, |
5100 | 5189 | llm_total_output_tokens_number, |
| 5190 | + total_pdf_page_count, |
5101 | 5191 | ], |
5102 | 5192 | show_progress_on=[redaction_output_summary_textbox], |
5103 | 5193 | ).success( |
@@ -5159,7 +5249,7 @@ def restore_sys_tracebacklimit(): |
5159 | 5249 | inputs=None, |
5160 | 5250 | outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base], |
5161 | 5251 | ).success( |
5162 | | - fn=get_input_file_names, |
| 5252 | + fn=get_document_file_names, |
5163 | 5253 | inputs=[input_pdf_for_review], |
5164 | 5254 | outputs=[ |
5165 | 5255 | doc_file_name_no_extension_textbox, |
@@ -7097,7 +7187,7 @@ def run_search_with_regex_option( |
7097 | 7187 |
|
7098 | 7188 | # Convert review file to xfdf Adobe format |
7099 | 7189 | convert_review_file_to_adobe_btn.click( |
7100 | | - fn=get_input_file_names, |
| 7190 | + fn=get_document_file_names, |
7101 | 7191 | inputs=[input_pdf_for_review], |
7102 | 7192 | outputs=[ |
7103 | 7193 | doc_file_name_no_extension_textbox, |
@@ -7170,7 +7260,7 @@ def run_search_with_regex_option( |
7170 | 7260 |
|
7171 | 7261 | # Convert xfdf Adobe file back to review_file.csv |
7172 | 7262 | convert_adobe_to_review_file_btn.click( |
7173 | | - fn=get_input_file_names, |
| 7263 | + fn=get_document_file_names, |
7174 | 7264 | inputs=[adobe_review_files_out], |
7175 | 7265 | outputs=[ |
7176 | 7266 | doc_file_name_no_extension_textbox, |
|
0 commit comments