
Commit cd53b50

Added optional deduplication after each analysis batch, controlled by the new ENABLE_BATCH_DEDUPLICATION variable, to keep the number of topics within suggested limits. LLM deduplication can now ignore sentiment. Other deduplication fixes. Ensured that all temporary files created during xlsx output are removed. Prompts now default to British English.
1 parent eddc355 commit cd53b50

8 files changed: +1048 −188 lines changed

app.py

Lines changed: 3 additions & 1 deletion
@@ -923,7 +923,6 @@ def show_info_box_on_click(
 
         with gr.Accordion("Response sentiment analysis", open=False):
             sentiment_checkbox = gr.Radio(
-                label="Response sentiment analysis",
                 value="Negative or Positive",
                 choices=[
                     "Negative or Positive",

@@ -1858,6 +1857,7 @@ def deduplicate_topics_llm_wrapper(
     aws_secret_key_textbox="",
     aws_region_textbox="",
     azure_api_key_textbox="",
+    sentiment_checkbox="Negative or Positive",
 ):
     # Ensure custom model_choice is registered in model_name_map
     ensure_model_in_map(model_choice)

@@ -1889,6 +1889,7 @@ def deduplicate_topics_llm_wrapper(
         aws_secret_key_textbox,
         aws_region_textbox,
         azure_api_key_textbox,
+        sentiment_checkbox=sentiment_checkbox,
     )

 deduplicate_llm_previous_data_btn.click(

@@ -1925,6 +1926,7 @@ def deduplicate_topics_llm_wrapper(
         aws_secret_key_textbox,
         aws_region_textbox,
         azure_api_key_textbox,
+        sentiment_checkbox,
     ],
     outputs=[
         master_reference_df_state,
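
For context on the wiring above: in Gradio, each component listed in a `.click()` call's `inputs` is passed positionally into the handler, which is why `sentiment_checkbox` is appended to the inputs list and the wrapper gains a matching parameter. A minimal, runnable sketch of the same pattern — component and handler names here are illustrative stand-ins, not the app's actual ones:

```python
import gradio as gr

def dedup_handler(threshold, sentiment_choice="Negative or Positive"):
    # Each component in `inputs` below arrives positionally, so the Radio's
    # current value lands in `sentiment_choice`.
    return f"Deduplicating at threshold {threshold}, sentiment mode: {sentiment_choice}"

with gr.Blocks() as demo:
    with gr.Accordion("Response sentiment analysis", open=False):
        sentiment_checkbox = gr.Radio(
            value="Negative or Positive",
            # Only the first choice is visible in the diff; the second is a stand-in.
            choices=["Negative or Positive", "Ignore sentiment"],
        )
    threshold = gr.Slider(0, 100, value=90, label="Deduplication threshold")
    output = gr.Textbox(label="Result")
    dedup_btn = gr.Button("Deduplicate topics")
    dedup_btn.click(
        dedup_handler,
        inputs=[threshold, sentiment_checkbox],
        outputs=[output],
    )

# demo.launch()
```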

cli_topics.py

Lines changed: 27 additions & 0 deletions
@@ -28,6 +28,7 @@
     DIRECT_MODE_DEFAULT_COST_CODE,
     DIRECT_MODE_S3_UPLOAD_ONLY_XLSX,
     DYNAMODB_USAGE_LOG_HEADERS,
+    ENABLE_BATCH_DEDUPLICATION,
     GEMINI_API_KEY,
     GRADIO_TEMP_DIR,
     HF_TOKEN,

@@ -36,6 +37,7 @@
     LLM_SEED,
     LLM_TEMPERATURE,
     MAX_TIME_FOR_LOOP,
+    MAXIMUM_ZERO_SHOT_TOPICS,
     OUTPUT_DEBUG_FILES,
     OUTPUT_FOLDER,
     RUN_AWS_FUNCTIONS,

@@ -825,6 +827,17 @@ def main(direct_mode_args={}):
         default="",
         help="Additional instructions for summary format.",
     )
+    extract_group.add_argument(
+        "--enable_batch_deduplication",
+        default=ENABLE_BATCH_DEDUPLICATION,
+        help=f"Enable deduplication after each batch during topic extraction (True/False). Default: {ENABLE_BATCH_DEDUPLICATION}",
+    )
+    extract_group.add_argument(
+        "--maximum_zero_shot_topics",
+        type=int,
+        default=MAXIMUM_ZERO_SHOT_TOPICS,
+        help=f"Maximum number of topics before triggering LLM-based deduplication. Default: {MAXIMUM_ZERO_SHOT_TOPICS}",
+    )

     # --- Validation Arguments ---
     validate_group = parser.add_argument_group("Topic Validation Options")
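
A note on using these flags before the next hunk: argparse delivers command-line values as strings, so `--maximum_zero_shot_topics` is converted by `type=int`, while `--enable_batch_deduplication` arrives as the string "True" or "False". The next hunk shows the `== "True"` coercion idiom this file applies to its other boolean options; a self-contained sketch of the same idiom, with the default hard-coded so the snippet runs on its own:

```python
import argparse

# Default mirrors the diff above, hard-coded rather than read from tools/config.py.
ENABLE_BATCH_DEDUPLICATION = False

parser = argparse.ArgumentParser()
parser.add_argument("--enable_batch_deduplication", default=ENABLE_BATCH_DEDUPLICATION)

# A value supplied on the command line arrives as the string "True" ...
args = parser.parse_args(["--enable_batch_deduplication", "True"])

# ... so it is compared against "True" to get a real boolean, matching the
# `args.output_debug_files == "True"` idiom visible in the next hunk.
enable = (
    args.enable_batch_deduplication is True
    or args.enable_batch_deduplication == "True"
)
print(enable)  # True
```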
@@ -1042,6 +1055,15 @@ def main(direct_mode_args={}):
     args.sample_reference_table = args.sample_reference_table == "True"
     args.output_debug_files = args.output_debug_files == "True"

+    # Update config module values for batch deduplication settings if provided via CLI
+    # These need to be set before importing/using llm_api_call functions
+    import tools.config as config_module
+
+    if hasattr(args, "enable_batch_deduplication"):
+        config_module.ENABLE_BATCH_DEDUPLICATION = args.enable_batch_deduplication
+    if hasattr(args, "maximum_zero_shot_topics"):
+        config_module.MAXIMUM_ZERO_SHOT_TOPICS = args.maximum_zero_shot_topics
+
     # Get username and folders
     (
         session_hash,
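
The comment in the hunk above ("set before importing/using llm_api_call functions") reflects standard Python import semantics: code that reads the value through the module object at call time sees the override, whereas `from tools.config import ENABLE_BATCH_DEDUPLICATION` would snapshot it at import time and miss it. A minimal sketch of that distinction, using a stand-in module object rather than the real tools.config:

```python
import types

# Stand-in for tools.config; the attribute name matches the diff above.
config_module = types.ModuleType("config_module")
config_module.ENABLE_BATCH_DEDUPLICATION = False

def run_batch():
    # Reads through the module attribute at call time, so it sees CLI overrides.
    return config_module.ENABLE_BATCH_DEDUPLICATION

snapshot = config_module.ENABLE_BATCH_DEDUPLICATION  # `from ... import`-style copy

config_module.ENABLE_BATCH_DEDUPLICATION = True  # the CLI override
print(run_batch())  # True  -- the late read picks up the new value
print(snapshot)     # False -- the snapshot predates the override
```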
@@ -1478,6 +1500,11 @@ def main(direct_mode_args={}):
             args.azure_api_key if hasattr(args, "azure_api_key") else ""
         ),
         model_name_map=model_name_map,
+        sentiment_checkbox=(
+            args.sentiment
+            if hasattr(args, "sentiment")
+            else "Negative or Positive"
+        ),
     )

     end_time = time.time()

tools/combine_sheets_into_xlsx.py

Lines changed: 15 additions & 5 deletions
@@ -422,13 +422,16 @@ def collect_output_csvs_and_create_excel_output(
     missing_df_state_csv_path = ""
     overall_summary_csv_path = ""
     number_of_responses_with_topic_assignment = 0
+    # Track all temporary CSV files created for xlsx conversion
+    temp_csv_files_for_cleanup = list()

     if in_group_col:
         group = in_group_col
     else:
         group = "All"

     overall_summary_csv_path = output_folder + "overall_summary_for_xlsx.csv"
+    temp_csv_files_for_cleanup.append(overall_summary_csv_path)

     if structured_summaries is True and not master_unique_topics_df_state.empty:
         print("Producing overall summary based on structured summaries.")

@@ -512,11 +515,13 @@ def collect_output_csvs_and_create_excel_output(

     reference_table_csv_path = output_folder + "reference_df_for_xlsx.csv"
     master_reference_df_state.to_csv(reference_table_csv_path, index=None)
+    temp_csv_files_for_cleanup.append(reference_table_csv_path)

     reference_pivot_table_csv_path = (
         output_folder + "reference_pivot_df_for_xlsx.csv"
     )
     reference_pivot_table.to_csv(reference_pivot_table_csv_path, index=None)
+    temp_csv_files_for_cleanup.append(reference_pivot_table_csv_path)

     short_file_name = os.path.basename(file_name)

@@ -530,6 +535,7 @@ def collect_output_csvs_and_create_excel_output(
             output_folder + "unique_topic_table_df_for_xlsx.csv"
         )
         master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index=None)
+        temp_csv_files_for_cleanup.append(unique_topic_table_csv_path)

         if unique_topic_table_csv_path:
             csv_files.append(unique_topic_table_csv_path)

@@ -596,6 +602,7 @@ def collect_output_csvs_and_create_excel_output(
     if not missing_df_state.empty:
         missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv"
         missing_df_state.to_csv(missing_df_state_csv_path, index=None)
+        temp_csv_files_for_cleanup.append(missing_df_state_csv_path)

         if missing_df_state_csv_path:
             if structured_summaries:

@@ -610,8 +617,6 @@ def collect_output_csvs_and_create_excel_output(
     else:
         print("Relevant missing responses files not found, excluding from xlsx output.")

-    new_csv_files = csv_files.copy()
-
     # Original data file
     original_ext = os.path.splitext(original_data_file_path)[1].lower()
     if original_ext == ".csv":

@@ -636,6 +641,7 @@ def collect_output_csvs_and_create_excel_output(
         )
         df.to_csv(original_data_csv_path, index=False)
         csv_files.append(original_data_csv_path)
+        temp_csv_files_for_cleanup.append(original_data_csv_path)

         sheet_names.append("Original data")
         column_widths["Original data"] = {"A": 10, "B": 20, "C": 20}

@@ -768,8 +774,12 @@ def collect_output_csvs_and_create_excel_output(

     xlsx_output_filenames = [xlsx_output_filename]

-    # Delete intermediate csv files
-    for csv_file in new_csv_files:
-        os.remove(csv_file)
+    # Delete all intermediate '_for_xlsx.csv' files
+    for csv_file in temp_csv_files_for_cleanup:
+        try:
+            if os.path.exists(csv_file):
+                os.remove(csv_file)
+        except Exception as e:
+            print(f"Could not delete temporary CSV file '{csv_file}' due to: {e}")

     return xlsx_output_filenames, xlsx_output_filenames
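
The final hunk above trades the old fail-fast `os.remove` loop for a defensive one: every intermediate CSV is recorded as it is written, and a missing or locked file no longer aborts the remaining deletions. A self-contained sketch of the same idiom — file names are taken from the diff, but written to a temp directory with placeholder content standing in for the `df.to_csv(...)` calls:

```python
import os
import tempfile

temp_csv_files_for_cleanup = []

# Write each intermediate CSV and record its path for later cleanup.
for name in ("overall_summary_for_xlsx.csv", "reference_df_for_xlsx.csv"):
    path = os.path.join(tempfile.gettempdir(), name)
    with open(path, "w") as f:
        f.write("placeholder\n")  # stands in for df.to_csv(path)
    temp_csv_files_for_cleanup.append(path)

# Delete defensively: skip files that are already gone, and report (rather
# than raise on) anything that cannot be removed.
for csv_file in temp_csv_files_for_cleanup:
    try:
        if os.path.exists(csv_file):
            os.remove(csv_file)
    except Exception as e:
        print(f"Could not delete temporary CSV file '{csv_file}' due to: {e}")
```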

tools/config.py

Lines changed: 5 additions & 2 deletions
@@ -322,7 +322,7 @@ def convert_string_to_boolean(value: str) -> bool:
     get_or_create_env_var("BATCH_SIZE_DEFAULT", "5")
 )  # Default batch size for LLM calls
 MAXIMUM_ZERO_SHOT_TOPICS = int(
-    get_or_create_env_var("MAXIMUM_ZERO_SHOT_TOPICS", "120")
+    get_or_create_env_var("MAXIMUM_ZERO_SHOT_TOPICS", "100")
 )  # Maximum number of zero shot topics to process
 MAX_SPACES_GPU_RUN_TIME = int(
     get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "240")

@@ -332,6 +332,10 @@ def convert_string_to_boolean(value: str) -> bool:
     get_or_create_env_var("DEDUPLICATION_THRESHOLD", "90")
 )  # Deduplication threshold for topic summary tables

+ENABLE_BATCH_DEDUPLICATION = convert_string_to_boolean(
+    get_or_create_env_var("ENABLE_BATCH_DEDUPLICATION", "False")
+)  # Whether to deduplicate topics after each batch during extraction
+
 ###
 # Model options
 ###
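
Before the final hunk of this file, a note on the pattern: the new setting follows the same env-var convention as the rest of tools/config.py. The helper bodies are not part of this diff, so the sketch below is an assumption about how `get_or_create_env_var` and `convert_string_to_boolean` likely behave, not their actual implementations:

```python
import os

# Assumed behaviour: read an environment variable, falling back to (and
# recording) a default, then map common true-ish strings to a bool.
def get_or_create_env_var(var_name: str, default_value: str) -> str:
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        return default_value
    return value

def convert_string_to_boolean(value: str) -> bool:
    return str(value).strip().lower() in ("true", "1", "yes")

ENABLE_BATCH_DEDUPLICATION = convert_string_to_boolean(
    get_or_create_env_var("ENABLE_BATCH_DEDUPLICATION", "False")
)
print(ENABLE_BATCH_DEDUPLICATION)  # False unless the env var is set to a truthy string
```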
@@ -724,7 +728,6 @@ def update_model_choice_config(default_model_source, model_name_map):
 LLM_MIN_P = float(get_or_create_env_var("LLM_MIN_P", "0"))
 LLM_TOP_P = float(get_or_create_env_var("LLM_TOP_P", "0.95"))
 LLM_REPETITION_PENALTY = float(get_or_create_env_var("LLM_REPETITION_PENALTY", "1.0"))
-
 LLM_LAST_N_TOKENS = int(get_or_create_env_var("LLM_LAST_N_TOKENS", "512"))
 LLM_MAX_NEW_TOKENS = int(get_or_create_env_var("LLM_MAX_NEW_TOKENS", "4096"))
 LLM_SEED = int(get_or_create_env_var("LLM_SEED", "42"))
