Skip to content

Commit ed25b66

Browse files
Optimised LLM prompting. Also improved the text search that runs after LLM entity detection.
1 parent 4e70b4d commit ed25b66

File tree

5 files changed: +138 additions, -234 deletions (lines changed)

app.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@
316316
# Add custom spaCy recognisers to the Comprehend list, so that the local spaCy model can be used to pick up entities (e.g. titles, street names, UK postcodes) that are sometimes missed by Comprehend
317317
CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
318318
FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
319-
# CHOSEN_LLM_ENTITIES.extend(custom_entities)
319+
FULL_LLM_ENTITY_LIST.extend(custom_entities)
320320

321321

322322
# 1. Create a custom error class
@@ -573,16 +573,12 @@ def change_tab_to_review_redactions():
573573

574574
walkthrough_custom_llm_instructions_textbox = gr.Textbox(
575575
label="Custom instructions for LLM-based entity detection",
576-
placeholder="e.g., 'don't redact anything related to Mark Wilson' or 'redact all company names with the label COMPANY_NAME'",
576+
placeholder="Specify new labels to redact with a description. E.g. 'Redact information related to Mark Wilson with the label MARK_WILSON' or 'redact all company names with the label COMPANY_NAME'.",
577577
value="",
578578
lines=3,
579579
visible=initial_is_llm_method,
580580
)
581581

582-
# Note: Accordion container removed to avoid block ID mismatches
583-
# Components are now rendered directly in the walkthrough
584-
585-
586582
## Redaction examples
587583
in_doc_files = gr.File(
588584
label="Choose a PDF document or image file (PDF, JPG, PNG)",
@@ -694,7 +690,7 @@ def change_tab_to_review_redactions():
694690

695691
custom_llm_instructions_textbox = gr.Textbox(
696692
label="Custom instructions for LLM-based entity detection",
697-
placeholder="Positive instructions are more likely to be successful than negative instructions. E.g. 'Redact information related to Mark Wilson with the label MARK_WILSON' or 'redact all company names with the label COMPANY_NAME' create labels you can filter by on the review screen, and are both more likely to be successful than 'Don't redact anything related to Mark Wilson' or 'Don't redact any company names.",
693+
placeholder="Specify new labels to redact with a description. E.g. 'Redact information related to Mark Wilson with the label MARK_WILSON' or 'redact all company names with the label COMPANY_NAME'.",
698694
value="",
699695
lines=3,
700696
visible=initial_is_llm_method,
@@ -2001,8 +1997,8 @@ def show_info_box_on_click(
20011997
0,
20021998
"paddle",
20031999
CHOSEN_REDACT_ENTITIES,
2004-
CHOSEN_LLM_ENTITIES,
2005-
"Redact personal information about Lauren with the label LAUREN. Redact any university names with the label UNIVERSITY.",
2000+
[],
2001+
"Redact Lauren's name, email addresses, and phone numbers with the label LAUREN. Redact university names with the label UNIVERSITY.",
20062002
],
20072003
)
20082004
ocr_example_labels.append("Example email LLM PII detection")
@@ -4673,6 +4669,8 @@ def restore_sys_tracebacklimit():
46734669
local_ocr_method_radio,
46744670
chosen_language_drop,
46754671
input_review_files,
4672+
custom_llm_instructions_textbox,
4673+
inference_server_vlm_model_textbox,
46764674
efficient_ocr_checkbox,
46774675
efficient_ocr_min_words_number,
46784676
],
@@ -5054,6 +5052,8 @@ def restore_sys_tracebacklimit():
50545052
local_ocr_method_radio,
50555053
chosen_language_drop,
50565054
input_review_files,
5055+
custom_llm_instructions_textbox,
5056+
inference_server_vlm_model_textbox,
50575057
efficient_ocr_checkbox,
50585058
efficient_ocr_min_words_number,
50595059
],

tools/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1463,7 +1463,7 @@ def update_model_choice_config(default_model_source, model_name_map):
14631463
# If set and non-empty, overrides CLOUD_LLM_PII_MODEL_CHOICE whenever custom instructions are passed to the LLM (e.g. allow-list style rules). Leave empty to always use CLOUD_LLM_PII_MODEL_CHOICE.
14641464
CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE = get_or_create_env_var(
14651465
"CLOUD_LLM_PII_CUSTOM_INSTRUCTIONS_MODEL_CHOICE",
1466-
"anthropic.claude-3-7-sonnet-20250219-v1:0", # Empty = use CLOUD_LLM_PII_MODEL_CHOICE even with custom instructions
1466+
"amazon.nova-pro-v1:0", # Empty = use CLOUD_LLM_PII_MODEL_CHOICE even with custom instructions
14671467
)
14681468

14691469
# Cloud LLM Model Choice for summarisation (AWS Bedrock / cloud)
@@ -1806,13 +1806,13 @@ def update_model_choice_config(default_model_source, model_name_map):
18061806

18071807
FULL_LLM_ENTITY_LIST = get_or_create_env_var(
18081808
"FULL_LLM_ENTITY_LIST",
1809-
"['EMAIL_ADDRESS','ADDRESS','NAME','PHONE_NUMBER', 'DATE_TIME', 'URL', 'IP_ADDRESS', 'MAC_ADDRESS', 'AGE', 'BANK_ACCOUNT_NUMBER', 'PASSPORT_NUMBER', 'CA_HEALTH_NUMBER', 'CUSTOM', 'CUSTOM_FUZZY']",
1809+
"['EMAIL_ADDRESS', 'STREET_ADDRESS','PERSON_NAME','PHONE_NUMBER', 'DATE_TIME', 'URL', 'IP_ADDRESS', 'AGE', 'BANK_ACCOUNT_NUMBER', 'PASSPORT_NUMBER', 'CUSTOM', 'CUSTOM_FUZZY']",
18101810
)
18111811

18121812
# Entities for LLM-based PII redaction option
18131813
CHOSEN_LLM_ENTITIES = get_or_create_env_var(
18141814
"CHOSEN_LLM_ENTITIES",
1815-
"['EMAIL_ADDRESS','ADDRESS','NAME','PHONE_NUMBER', 'CUSTOM']",
1815+
"['EMAIL_ADDRESS','STREET_ADDRESS','PERSON_NAME','PHONE_NUMBER', 'CUSTOM']",
18161816
)
18171817

18181818

tools/custom_image_analyser_engine.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7682,7 +7682,7 @@ def analyze_text(
76827682
bedrock_runtime=bedrock_runtime,
76837683
language=aws_language,
76847684
allow_list=text_analyzer_kwargs.get("allow_list", []),
7685-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
7685+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
76867686
all_text_line_results=all_text_line_results,
76877687
model_choice=model_choice,
76887688
temperature=text_analyzer_kwargs.get(
@@ -7784,7 +7784,7 @@ def analyze_text(
77847784
bedrock_runtime=bedrock_runtime,
77857785
language=aws_language,
77867786
allow_list=text_analyzer_kwargs.get("allow_list", []),
7787-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
7787+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
77887788
all_text_line_results=all_text_line_results,
77897789
model_choice=model_choice,
77907790
temperature=text_analyzer_kwargs.get(
@@ -7861,7 +7861,7 @@ def analyze_text(
78617861
bedrock_runtime=bedrock_runtime,
78627862
language=aws_language,
78637863
allow_list=text_analyzer_kwargs.get("allow_list", []),
7864-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
7864+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
78657865
all_text_line_results=all_text_line_results,
78667866
model_choice=model_choice,
78677867
temperature=text_analyzer_kwargs.get(
@@ -8043,7 +8043,7 @@ def analyze_text(
80438043
bedrock_runtime=bedrock_runtime,
80448044
language=aws_language,
80458045
allow_list=text_analyzer_kwargs.get("allow_list", []),
8046-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
8046+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
80478047
all_text_line_results=all_text_line_results,
80488048
model_choice=model_choice,
80498049
temperature=text_analyzer_kwargs.get(
@@ -8145,7 +8145,7 @@ def analyze_text(
81458145
bedrock_runtime=bedrock_runtime,
81468146
language=aws_language,
81478147
allow_list=text_analyzer_kwargs.get("allow_list", []),
8148-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
8148+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
81498149
all_text_line_results=all_text_line_results,
81508150
model_choice=model_choice,
81518151
temperature=text_analyzer_kwargs.get(
@@ -8222,7 +8222,7 @@ def analyze_text(
82228222
bedrock_runtime=bedrock_runtime,
82238223
language=aws_language,
82248224
allow_list=text_analyzer_kwargs.get("allow_list", []),
8225-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
8225+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
82268226
all_text_line_results=all_text_line_results,
82278227
model_choice=model_choice,
82288228
temperature=text_analyzer_kwargs.get(
@@ -8433,7 +8433,7 @@ def analyze_text(
84338433
bedrock_runtime=bedrock_runtime,
84348434
language=aws_language,
84358435
allow_list=text_analyzer_kwargs.get("allow_list", []),
8436-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
8436+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
84378437
all_text_line_results=all_text_line_results,
84388438
model_choice=model_choice,
84398439
temperature=text_analyzer_kwargs.get(
@@ -8535,7 +8535,7 @@ def analyze_text(
85358535
bedrock_runtime=bedrock_runtime,
85368536
language=aws_language,
85378537
allow_list=text_analyzer_kwargs.get("allow_list", []),
8538-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
8538+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
85398539
all_text_line_results=all_text_line_results,
85408540
model_choice=model_choice,
85418541
temperature=text_analyzer_kwargs.get(
@@ -8612,7 +8612,7 @@ def analyze_text(
86128612
bedrock_runtime=bedrock_runtime,
86138613
language=aws_language,
86148614
allow_list=text_analyzer_kwargs.get("allow_list", []),
8615-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
8615+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
86168616
all_text_line_results=all_text_line_results,
86178617
model_choice=model_choice,
86188618
temperature=text_analyzer_kwargs.get(
@@ -9633,7 +9633,7 @@ def run_page_text_redaction(
96339633
allow_list=text_analyzer_kwargs.get(
96349634
"allow_list", allow_list or []
96359635
),
9636-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
9636+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
96379637
all_text_line_results=all_text_line_results,
96389638
model_choice=model_choice,
96399639
temperature=text_analyzer_kwargs.get(
@@ -9727,7 +9727,7 @@ def run_page_text_redaction(
97279727
allow_list=text_analyzer_kwargs.get(
97289728
"allow_list", allow_list or []
97299729
),
9730-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
9730+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
97319731
all_text_line_results=all_text_line_results,
97329732
model_choice=model_choice,
97339733
temperature=text_analyzer_kwargs.get(
@@ -9793,7 +9793,7 @@ def run_page_text_redaction(
97939793
bedrock_runtime=bedrock_runtime,
97949794
language=aws_language,
97959795
allow_list=text_analyzer_kwargs.get("allow_list", allow_list or []),
9796-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
9796+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
97979797
all_text_line_results=all_text_line_results,
97989798
model_choice=model_choice,
97999799
temperature=text_analyzer_kwargs.get(
@@ -9969,7 +9969,7 @@ def run_page_text_redaction(
99699969
allow_list=text_analyzer_kwargs.get(
99709970
"allow_list", allow_list or []
99719971
),
9972-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
9972+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
99739973
all_text_line_results=all_text_line_results,
99749974
model_choice=model_choice,
99759975
temperature=text_analyzer_kwargs.get(
@@ -10063,7 +10063,7 @@ def run_page_text_redaction(
1006310063
allow_list=text_analyzer_kwargs.get(
1006410064
"allow_list", allow_list or []
1006510065
),
10066-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
10066+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
1006710067
all_text_line_results=all_text_line_results,
1006810068
model_choice=model_choice,
1006910069
temperature=text_analyzer_kwargs.get(
@@ -10129,7 +10129,7 @@ def run_page_text_redaction(
1012910129
bedrock_runtime=bedrock_runtime,
1013010130
language=aws_language,
1013110131
allow_list=text_analyzer_kwargs.get("allow_list", allow_list or []),
10132-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
10132+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
1013310133
all_text_line_results=all_text_line_results,
1013410134
model_choice=model_choice,
1013510135
temperature=text_analyzer_kwargs.get(
@@ -10336,7 +10336,7 @@ def run_page_text_redaction(
1033610336
allow_list=text_analyzer_kwargs.get(
1033710337
"allow_list", allow_list or []
1033810338
),
10339-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
10339+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
1034010340
all_text_line_results=all_text_line_results,
1034110341
model_choice=model_choice,
1034210342
temperature=text_analyzer_kwargs.get(
@@ -10430,7 +10430,7 @@ def run_page_text_redaction(
1043010430
allow_list=text_analyzer_kwargs.get(
1043110431
"allow_list", allow_list or []
1043210432
),
10433-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
10433+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
1043410434
all_text_line_results=all_text_line_results,
1043510435
model_choice=model_choice,
1043610436
temperature=text_analyzer_kwargs.get(
@@ -10496,7 +10496,7 @@ def run_page_text_redaction(
1049610496
bedrock_runtime=bedrock_runtime,
1049710497
language=aws_language,
1049810498
allow_list=text_analyzer_kwargs.get("allow_list", allow_list or []),
10499-
chosen_redact_comprehend_entities=llm_chosen_redact_comprehend_entities,
10499+
chosen_redact_llm_entities=llm_chosen_redact_comprehend_entities,
1050010500
all_text_line_results=all_text_line_results,
1050110501
model_choice=model_choice,
1050210502
temperature=text_analyzer_kwargs.get(

0 commit comments

Comments
 (0)