
Commit cd53b50

Added optional deduplication after each analysis batch, controlled by the new ENABLE_BATCH_DEDUPLICATION variable, to keep the number of topics within suggested limits. LLM deduplication can now ignore sentiment. Other deduplication fixes. Ensured that all temporary files created during xlsx output are removed. Prompts now default to British English.
1 parent eddc355 commit cd53b50

8 files changed: +1048 −188 lines changed

app.py

Lines changed: 3 additions & 1 deletion
@@ -923,7 +923,6 @@ def show_info_box_on_click(
 
         with gr.Accordion("Response sentiment analysis", open=False):
             sentiment_checkbox = gr.Radio(
-                label="Response sentiment analysis",
                 value="Negative or Positive",
                 choices=[
                     "Negative or Positive",

@@ -1858,6 +1857,7 @@ def deduplicate_topics_llm_wrapper(
     aws_secret_key_textbox="",
     aws_region_textbox="",
     azure_api_key_textbox="",
+    sentiment_checkbox="Negative or Positive",
 ):
     # Ensure custom model_choice is registered in model_name_map
     ensure_model_in_map(model_choice)

@@ -1889,6 +1889,7 @@ def deduplicate_topics_llm_wrapper(
         aws_secret_key_textbox,
         aws_region_textbox,
         azure_api_key_textbox,
+        sentiment_checkbox=sentiment_checkbox,
     )

 deduplicate_llm_previous_data_btn.click(

@@ -1925,6 +1926,7 @@ def deduplicate_topics_llm_wrapper(
         aws_secret_key_textbox,
         aws_region_textbox,
         azure_api_key_textbox,
+        sentiment_checkbox,
     ],
     outputs=[
         master_reference_df_state,
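
For context on the wiring above: in Gradio, each component listed in a `.click()` call's `inputs` is passed positionally into the handler, which is why `sentiment_checkbox` is appended to the inputs list and the wrapper gains a matching parameter. A minimal, runnable sketch of the same pattern — component and handler names here are illustrative stand-ins, not the app's actual ones:

```python
import gradio as gr

def dedup_handler(threshold, sentiment_choice="Negative or Positive"):
    # Each component in `inputs` below arrives positionally, so the Radio's
    # current value lands in `sentiment_choice`.
    return f"Deduplicating at threshold {threshold}, sentiment mode: {sentiment_choice}"

with gr.Blocks() as demo:
    with gr.Accordion("Response sentiment analysis", open=False):
        sentiment_checkbox = gr.Radio(
            value="Negative or Positive",
            # Only the first choice is visible in the diff; the second is a stand-in.
            choices=["Negative or Positive", "Ignore sentiment"],
        )
    threshold = gr.Slider(0, 100, value=90, label="Deduplication threshold")
    output = gr.Textbox(label="Result")
    dedup_btn = gr.Button("Deduplicate topics")
    dedup_btn.click(
        dedup_handler,
        inputs=[threshold, sentiment_checkbox],
        outputs=[output],
    )

# demo.launch()
```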

cli_topics.py

Lines changed: 27 additions & 0 deletions
@@ -28,6 +28,7 @@
     DIRECT_MODE_DEFAULT_COST_CODE,
     DIRECT_MODE_S3_UPLOAD_ONLY_XLSX,
     DYNAMODB_USAGE_LOG_HEADERS,
+    ENABLE_BATCH_DEDUPLICATION,
     GEMINI_API_KEY,
     GRADIO_TEMP_DIR,
     HF_TOKEN,

@@ -36,6 +37,7 @@
     LLM_SEED,
     LLM_TEMPERATURE,
     MAX_TIME_FOR_LOOP,
+    MAXIMUM_ZERO_SHOT_TOPICS,
     OUTPUT_DEBUG_FILES,
     OUTPUT_FOLDER,
     RUN_AWS_FUNCTIONS,

@@ -825,6 +827,17 @@ def main(direct_mode_args={}):
         default="",
         help="Additional instructions for summary format.",
     )
+    extract_group.add_argument(
+        "--enable_batch_deduplication",
+        default=ENABLE_BATCH_DEDUPLICATION,
+        help=f"Enable deduplication after each batch during topic extraction (True/False). Default: {ENABLE_BATCH_DEDUPLICATION}",
+    )
+    extract_group.add_argument(
+        "--maximum_zero_shot_topics",
+        type=int,
+        default=MAXIMUM_ZERO_SHOT_TOPICS,
+        help=f"Maximum number of topics before triggering LLM-based deduplication. Default: {MAXIMUM_ZERO_SHOT_TOPICS}",
+    )

     # --- Validation Arguments ---
     validate_group = parser.add_argument_group("Topic Validation Options")
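
A note on using these flags before the next hunk: argparse delivers command-line values as strings, so `--maximum_zero_shot_topics` is converted by `type=int`, while `--enable_batch_deduplication` arrives as the string "True" or "False". The next hunk shows the `== "True"` coercion idiom this file applies to its other boolean options; a self-contained sketch of the same idiom, with the default hard-coded so the snippet runs on its own:

```python
import argparse

# Default mirrors the diff above, hard-coded rather than read from tools/config.py.
ENABLE_BATCH_DEDUPLICATION = False

parser = argparse.ArgumentParser()
parser.add_argument("--enable_batch_deduplication", default=ENABLE_BATCH_DEDUPLICATION)

# A value supplied on the command line arrives as the string "True" ...
args = parser.parse_args(["--enable_batch_deduplication", "True"])

# ... so it is compared against "True" to get a real boolean, matching the
# `args.output_debug_files == "True"` idiom visible in the next hunk.
enable = (
    args.enable_batch_deduplication is True
    or args.enable_batch_deduplication == "True"
)
print(enable)  # True
```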
@@ -1042,6 +1055,15 @@ def main(direct_mode_args={}):
     args.sample_reference_table = args.sample_reference_table == "True"
     args.output_debug_files = args.output_debug_files == "True"

+    # Update config module values for batch deduplication settings if provided via CLI
+    # These need to be set before importing/using llm_api_call functions
+    import tools.config as config_module
+
+    if hasattr(args, "enable_batch_deduplication"):
+        config_module.ENABLE_BATCH_DEDUPLICATION = args.enable_batch_deduplication
+    if hasattr(args, "maximum_zero_shot_topics"):
+        config_module.MAXIMUM_ZERO_SHOT_TOPICS = args.maximum_zero_shot_topics
+
     # Get username and folders
     (
         session_hash,
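
The comment in the hunk above ("set before importing/using llm_api_call functions") reflects standard Python import semantics: code that reads the value through the module object at call time sees the override, whereas `from tools.config import ENABLE_BATCH_DEDUPLICATION` would snapshot it at import time and miss it. A minimal sketch of that distinction, using a stand-in module object rather than the real tools.config:

```python
import types

# Stand-in for tools.config; the attribute name matches the diff above.
config_module = types.ModuleType("config_module")
config_module.ENABLE_BATCH_DEDUPLICATION = False

def run_batch():
    # Reads through the module attribute at call time, so it sees CLI overrides.
    return config_module.ENABLE_BATCH_DEDUPLICATION

snapshot = config_module.ENABLE_BATCH_DEDUPLICATION  # `from ... import`-style copy

config_module.ENABLE_BATCH_DEDUPLICATION = True  # the CLI override
print(run_batch())  # True  -- the late read picks up the new value
print(snapshot)     # False -- the snapshot predates the override
```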
@@ -1478,6 +1500,11 @@ def main(direct_mode_args={}):
             args.azure_api_key if hasattr(args, "azure_api_key") else ""
         ),
         model_name_map=model_name_map,
+        sentiment_checkbox=(
+            args.sentiment
+            if hasattr(args, "sentiment")
+            else "Negative or Positive"
+        ),
     )

     end_time = time.time()

tools/combine_sheets_into_xlsx.py

Lines changed: 15 additions & 5 deletions
@@ -422,13 +422,16 @@ def collect_output_csvs_and_create_excel_output(
     missing_df_state_csv_path = ""
     overall_summary_csv_path = ""
     number_of_responses_with_topic_assignment = 0
+    # Track all temporary CSV files created for xlsx conversion
+    temp_csv_files_for_cleanup = list()

     if in_group_col:
         group = in_group_col
     else:
         group = "All"

     overall_summary_csv_path = output_folder + "overall_summary_for_xlsx.csv"
+    temp_csv_files_for_cleanup.append(overall_summary_csv_path)

     if structured_summaries is True and not master_unique_topics_df_state.empty:
         print("Producing overall summary based on structured summaries.")

@@ -512,11 +515,13 @@ def collect_output_csvs_and_create_excel_output(

     reference_table_csv_path = output_folder + "reference_df_for_xlsx.csv"
     master_reference_df_state.to_csv(reference_table_csv_path, index=None)
+    temp_csv_files_for_cleanup.append(reference_table_csv_path)

     reference_pivot_table_csv_path = (
         output_folder + "reference_pivot_df_for_xlsx.csv"
     )
     reference_pivot_table.to_csv(reference_pivot_table_csv_path, index=None)
+    temp_csv_files_for_cleanup.append(reference_pivot_table_csv_path)

     short_file_name = os.path.basename(file_name)

@@ -530,6 +535,7 @@ def collect_output_csvs_and_create_excel_output(
             output_folder + "unique_topic_table_df_for_xlsx.csv"
         )
         master_unique_topics_df_state.to_csv(unique_topic_table_csv_path, index=None)
+        temp_csv_files_for_cleanup.append(unique_topic_table_csv_path)

         if unique_topic_table_csv_path:
             csv_files.append(unique_topic_table_csv_path)

@@ -596,6 +602,7 @@ def collect_output_csvs_and_create_excel_output(
     if not missing_df_state.empty:
         missing_df_state_csv_path = output_folder + "missing_df_state_df_for_xlsx.csv"
         missing_df_state.to_csv(missing_df_state_csv_path, index=None)
+        temp_csv_files_for_cleanup.append(missing_df_state_csv_path)

         if missing_df_state_csv_path:
             if structured_summaries:

@@ -610,8 +617,6 @@ def collect_output_csvs_and_create_excel_output(
     else:
         print("Relevant missing responses files not found, excluding from xlsx output.")

-    new_csv_files = csv_files.copy()
-
     # Original data file
     original_ext = os.path.splitext(original_data_file_path)[1].lower()
     if original_ext == ".csv":

@@ -636,6 +641,7 @@ def collect_output_csvs_and_create_excel_output(
         )
         df.to_csv(original_data_csv_path, index=False)
         csv_files.append(original_data_csv_path)
+        temp_csv_files_for_cleanup.append(original_data_csv_path)

         sheet_names.append("Original data")
         column_widths["Original data"] = {"A": 10, "B": 20, "C": 20}

@@ -768,8 +774,12 @@ def collect_output_csvs_and_create_excel_output(

     xlsx_output_filenames = [xlsx_output_filename]

-    # Delete intermediate csv files
-    for csv_file in new_csv_files:
-        os.remove(csv_file)
+    # Delete all intermediate '_for_xlsx.csv' files
+    for csv_file in temp_csv_files_for_cleanup:
+        try:
+            if os.path.exists(csv_file):
+                os.remove(csv_file)
+        except Exception as e:
+            print(f"Could not delete temporary CSV file '{csv_file}' due to: {e}")

     return xlsx_output_filenames, xlsx_output_filenames
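
The final hunk above trades the old fail-fast `os.remove` loop for a defensive one: every intermediate CSV is recorded as it is written, and a missing or locked file no longer aborts the remaining deletions. A self-contained sketch of the same idiom — file names are taken from the diff, but written to a temp directory with placeholder content standing in for the `df.to_csv(...)` calls:

```python
import os
import tempfile

temp_csv_files_for_cleanup = []

# Write each intermediate CSV and record its path for later cleanup.
for name in ("overall_summary_for_xlsx.csv", "reference_df_for_xlsx.csv"):
    path = os.path.join(tempfile.gettempdir(), name)
    with open(path, "w") as f:
        f.write("placeholder\n")  # stands in for df.to_csv(path)
    temp_csv_files_for_cleanup.append(path)

# Delete defensively: skip files that are already gone, and report (rather
# than raise on) anything that cannot be removed.
for csv_file in temp_csv_files_for_cleanup:
    try:
        if os.path.exists(csv_file):
            os.remove(csv_file)
    except Exception as e:
        print(f"Could not delete temporary CSV file '{csv_file}' due to: {e}")
```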

tools/config.py

Lines changed: 5 additions & 2 deletions
@@ -322,7 +322,7 @@ def convert_string_to_boolean(value: str) -> bool:
     get_or_create_env_var("BATCH_SIZE_DEFAULT", "5")
 )  # Default batch size for LLM calls
 MAXIMUM_ZERO_SHOT_TOPICS = int(
-    get_or_create_env_var("MAXIMUM_ZERO_SHOT_TOPICS", "120")
+    get_or_create_env_var("MAXIMUM_ZERO_SHOT_TOPICS", "100")
 )  # Maximum number of zero shot topics to process
 MAX_SPACES_GPU_RUN_TIME = int(
     get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "240")

@@ -332,6 +332,10 @@ def convert_string_to_boolean(value: str) -> bool:
     get_or_create_env_var("DEDUPLICATION_THRESHOLD", "90")
 )  # Deduplication threshold for topic summary tables

+ENABLE_BATCH_DEDUPLICATION = convert_string_to_boolean(
+    get_or_create_env_var("ENABLE_BATCH_DEDUPLICATION", "False")
+)  # Whether to deduplicate topics after each batch during extraction
+
 ###
 # Model options
 ###
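
Before the final hunk of this file, a note on the pattern: the new setting follows the same env-var convention as the rest of tools/config.py. The helper bodies are not part of this diff, so the sketch below is an assumption about how `get_or_create_env_var` and `convert_string_to_boolean` likely behave, not their actual implementations:

```python
import os

# Assumed behaviour: read an environment variable, falling back to (and
# recording) a default, then map common true-ish strings to a bool.
def get_or_create_env_var(var_name: str, default_value: str) -> str:
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        return default_value
    return value

def convert_string_to_boolean(value: str) -> bool:
    return str(value).strip().lower() in ("true", "1", "yes")

ENABLE_BATCH_DEDUPLICATION = convert_string_to_boolean(
    get_or_create_env_var("ENABLE_BATCH_DEDUPLICATION", "False")
)
print(ENABLE_BATCH_DEDUPLICATION)  # False unless the env var is set to a truthy string
```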
@@ -724,7 +728,6 @@ def update_model_choice_config(default_model_source, model_name_map):
 LLM_MIN_P = float(get_or_create_env_var("LLM_MIN_P", "0"))
 LLM_TOP_P = float(get_or_create_env_var("LLM_TOP_P", "0.95"))
 LLM_REPETITION_PENALTY = float(get_or_create_env_var("LLM_REPETITION_PENALTY", "1.0"))
-
 LLM_LAST_N_TOKENS = int(get_or_create_env_var("LLM_LAST_N_TOKENS", "512"))
 LLM_MAX_NEW_TOKENS = int(get_or_create_env_var("LLM_MAX_NEW_TOKENS", "4096"))
 LLM_SEED = int(get_or_create_env_var("LLM_SEED", "42"))
