Commit 8daacf3

Merge pull request #65 from seanpedrick-case/dev
Rollback to Gradio 6.0.2 as some features degraded. Improved LLM dedu…
2 parents 5d39fde + c0280e8 commit 8daacf3

18 files changed: +1856 / -355 lines

Dockerfile

Lines changed: 8 additions & 9 deletions
@@ -1,6 +1,7 @@
-# This Dockerfile is optimised for AWS ECS using Python 3.11, and assumes CPU inference with OpenBLAS for local models.
+# This Dockerfile is optimised for AWS ECS using Python 3.12, and assumes CUDA 12.6 for local models. The Dockerfile will need to be modified to install all linux CUDA / GPU dependencies.
+
 # Stage 1: Build dependencies and download models
-FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie AS builder
+FROM public.ecr.aws/docker/library/python:3.12.12-slim-trixie AS builder
 
 # Install system dependencies.
 RUN apt-get update && apt-get install -y \
@@ -19,22 +20,20 @@ WORKDIR /src
 
 COPY requirements_lightweight.txt .
 
-# Set environment variables for OpenBLAS - not necessary if not building from source
-# ENV OPENBLAS_VERBOSE=1
-# ENV CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"
-
 ARG INSTALL_TORCH=False
 ENV INSTALL_TORCH=${INSTALL_TORCH}
 
+# Local torch install requires CUDA 12.6
 RUN if [ "$INSTALL_TORCH" = "True" ]; then \
-    pip install --no-cache-dir --target=/install torch==2.9.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu; \
+    pip install --no-cache-dir --target=/install torch==2.9.1 --extra-index-url https://download.pytorch.org/whl/cu126; \
     fi
 
 ARG INSTALL_LLAMA_CPP_PYTHON=False
 ENV INSTALL_LLAMA_CPP_PYTHON=${INSTALL_LLAMA_CPP_PYTHON}
 
+# Llama CPP Python install requires CUDA 12.4
 RUN if [ "$INSTALL_LLAMA_CPP_PYTHON" = "True" ]; then \
-    pip install --no-cache-dir --target=/install https://github.com/seanpedrick-case/llama-cpp-python-whl-builder/releases/download/v0.1.0/llama_cpp_python-0.3.16-cp311-cp311-linux_x86_64.whl; \
+    pip install --no-cache-dir --target=/install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl; \
    fi
 
 RUN pip install --no-cache-dir --target=/install -r requirements_lightweight.txt
@@ -44,7 +43,7 @@ RUN rm requirements_lightweight.txt
 # ===================================================================
 # Stage 2: A common 'base' for both Lambda and Gradio
 # ===================================================================
-FROM public.ecr.aws/docker/library/python:3.11.13-slim-trixie AS base
+FROM public.ecr.aws/docker/library/python:3.12.12-slim-trixie AS base
 
 # Set build-time and runtime environment variable for whether to run in Gradio mode or Lambda mode
 ARG APP_MODE=gradio
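
Because the builder stage now installs CUDA-enabled wheels only when the corresponding build args are set, a quick runtime check inside the built image can confirm what actually landed in the install target. A minimal sketch, not part of the repository; both packages are optional extras, so missing imports are reported rather than raised:

```python
# check_gpu_stack.py (hypothetical file name) - run inside the built container to
# confirm which optional GPU dependencies the build args actually installed.
def check_gpu_stack() -> None:
    try:
        import torch

        print(f"torch {torch.__version__}, CUDA available: {torch.cuda.is_available()}")
    except ImportError:
        print("torch not installed (INSTALL_TORCH build arg was not True)")

    try:
        import llama_cpp

        print(f"llama-cpp-python {llama_cpp.__version__} imported successfully")
    except ImportError:
        print("llama-cpp-python not installed (INSTALL_LLAMA_CPP_PYTHON build arg was not True)")


if __name__ == "__main__":
    check_gpu_stack()
```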

README.md

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@ emoji: 📚
 colorFrom: purple
 colorTo: yellow
 sdk: gradio
-sdk_version: 6.2.0
+sdk_version: 6.0.2
 app_file: app.py
 pinned: true
 license: agpl-3.0
@@ -13,7 +13,7 @@ short_description: Create thematic summaries for open text data with LLMs
 
 # Large language model topic modelling
 
-Version: 0.8.0
+Version: 0.9.0
 
 Extract topics and summarise outputs using Large Language Models (LLMs), either local, Gemini, Azure, or AWS Bedrock models (e.g. Claude, Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets on the main app page, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
 

app.py

Lines changed: 14 additions & 70 deletions
@@ -6,7 +6,7 @@
 
 from tools.auth import authenticate_user
 from tools.aws_functions import (
-    download_file_from_s3,
+    download_cost_codes_with_error_handling,
     export_outputs_to_s3,
     upload_file_to_s3,
 )
@@ -83,6 +83,7 @@
     LOG_FILE_NAME,
     MAX_FILE_SIZE,
     MAX_QUEUE_SIZE,
+    MAXIMUM_ALLOWED_TOPICS,
     MPLCONFIGDIR,
     OUTPUT_COST_CODES_PATH,
     OUTPUT_DEBUG_FILES,
@@ -119,7 +120,6 @@
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.dedup_summaries import (
     deduplicate_topics,
-    deduplicate_topics_llm,
     overall_summary,
     wrapper_summarise_output_topics_per_group,
 )
@@ -135,7 +135,6 @@
     empty_output_vars_extract_topics,
     empty_output_vars_summarise,
     enforce_cost_codes,
-    ensure_model_in_map,
     get_connection_params,
     join_cols_onto_reference_df,
     load_in_data_file,
@@ -151,6 +150,7 @@
 )
 from tools.llm_api_call import (
     all_in_one_pipeline,
+    deduplicate_topics_llm_wrapper,
     modify_existing_output_tables,
     validate_topics_wrapper,
     wrapper_extract_topics_per_column_value,
@@ -923,6 +923,7 @@ def show_info_box_on_click(
 
         with gr.Accordion("Response sentiment analysis", open=False):
            sentiment_checkbox = gr.Radio(
+                label="Should the model assess the sentiment of responses?",
                value="Negative or Positive",
                choices=[
                    "Negative or Positive",
@@ -1268,7 +1269,15 @@ def show_info_box_on_click(
            precision=1,
            step=0.1,
        )
+        with gr.Row(equal_height=True):
            batch_size_number.render()
+            max_topics_number = gr.Number(
+                value=MAXIMUM_ALLOWED_TOPICS,
+                label="Maximum number of topics allowed. If exceeded, the LLM will make efforts to deduplicate topics after every batch until the total number of topics is below this number (not foolproof).",
+                precision=0,
+                minimum=1,
+                maximum=1000,
+            )
        random_seed = gr.Number(
            value=LLM_SEED, label="Random seed for LLM generation", visible="hidden"
        )
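
The new max_topics_number control passes MAXIMUM_ALLOWED_TOPICS into the extraction event handlers below; as the label describes, when the running topic list grows past this cap the pipeline attempts LLM-based deduplication after each batch. A hedged sketch of that trigger logic, with an illustrative helper that is not the repository's implementation (the real merge step sits behind deduplicate_topics_llm_wrapper):

```python
# Illustrative sketch of the cap-triggered deduplication described by the
# max_topics_number label; llm_dedup stands in for the repository's LLM merge step.
from typing import Callable

import pandas as pd


def maybe_deduplicate(
    topic_summary_df: pd.DataFrame,
    max_topics: int,
    llm_dedup: Callable[[pd.DataFrame], pd.DataFrame],
) -> pd.DataFrame:
    """Run the supplied LLM deduplication step only when the topic count exceeds the cap."""
    if len(topic_summary_df) <= max_topics:
        return topic_summary_df
    # Not foolproof: the merged table may still exceed max_topics after one pass.
    return llm_dedup(topic_summary_df)
```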
@@ -1533,6 +1542,7 @@ def show_info_box_on_click(
         additional_validation_issues_textbox,
         show_previous_table_radio,
         api_url_textbox,
+        max_topics_number,
     ],
     outputs=[
         display_topic_table_markdown,
@@ -1676,6 +1686,7 @@ def show_info_box_on_click(
         aws_secret_key_textbox,
         aws_region_textbox,
         api_url_textbox,
+        max_topics_number,
     ],
     outputs=[
         display_topic_table_markdown,
@@ -1835,63 +1846,6 @@ def show_info_box_on_click(
         api_name="deduplicate_topics",
     )
 
-    # When LLM deduplication button pressed, deduplicate data using LLM
-    def deduplicate_topics_llm_wrapper(
-        reference_df,
-        topic_summary_df,
-        reference_table_file_name,
-        unique_topics_table_file_name,
-        model_choice,
-        in_api_key,
-        temperature,
-        in_excel_sheets,
-        merge_sentiment,
-        merge_general_topics,
-        in_data_files,
-        chosen_cols,
-        output_folder,
-        candidate_topics=None,
-        azure_endpoint="",
-        api_url=None,
-        aws_access_key_textbox="",
-        aws_secret_key_textbox="",
-        aws_region_textbox="",
-        azure_api_key_textbox="",
-        sentiment_checkbox="Negative or Positive",
-    ):
-        # Ensure custom model_choice is registered in model_name_map
-        ensure_model_in_map(model_choice)
-        model_source = model_name_map[model_choice]["source"]
-        return deduplicate_topics_llm(
-            reference_df,
-            topic_summary_df,
-            reference_table_file_name,
-            unique_topics_table_file_name,
-            model_choice,
-            in_api_key,
-            temperature,
-            model_source,
-            None,
-            None,
-            None,
-            None,
-            in_excel_sheets,
-            merge_sentiment,
-            merge_general_topics,
-            in_data_files,
-            chosen_cols,
-            output_folder,
-            candidate_topics,
-            azure_endpoint,
-            OUTPUT_DEBUG_FILES,
-            api_url,
-            aws_access_key_textbox,
-            aws_secret_key_textbox,
-            aws_region_textbox,
-            azure_api_key_textbox,
-            sentiment_checkbox=sentiment_checkbox,
-        )
-
     deduplicate_llm_previous_data_btn.click(
         load_in_previous_data_files,
         inputs=[deduplication_input_files],
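
The wrapper deleted here is not gone: the import hunk near the top of this file now pulls deduplicate_topics_llm_wrapper from tools.llm_api_call. A hedged reconstruction of the relocated helper, based on the code deleted above; the import paths marked as assumptions are guesses, and the module's real version may differ in detail:

```python
# Hedged reconstruction of the wrapper now living in tools/llm_api_call.py, built
# from the code deleted above. Where model_name_map, ensure_model_in_map and
# OUTPUT_DEBUG_FILES are defined is assumed, not confirmed by this diff.
from tools.config import OUTPUT_DEBUG_FILES  # assumed module path
from tools.dedup_summaries import deduplicate_topics_llm
from tools.helper_functions import ensure_model_in_map, model_name_map  # assumed module path


def deduplicate_topics_llm_wrapper(
    reference_df, topic_summary_df, reference_table_file_name, unique_topics_table_file_name,
    model_choice, in_api_key, temperature, in_excel_sheets, merge_sentiment,
    merge_general_topics, in_data_files, chosen_cols, output_folder, candidate_topics=None,
    azure_endpoint="", api_url=None, aws_access_key_textbox="", aws_secret_key_textbox="",
    aws_region_textbox="", azure_api_key_textbox="", sentiment_checkbox="Negative or Positive",
):
    # Register a custom model_choice in model_name_map before resolving its source
    ensure_model_in_map(model_choice)
    model_source = model_name_map[model_choice]["source"]
    # Delegate to the underlying LLM deduplication, mirroring the deleted wrapper above
    return deduplicate_topics_llm(
        reference_df, topic_summary_df, reference_table_file_name, unique_topics_table_file_name,
        model_choice, in_api_key, temperature, model_source, None, None, None, None,
        in_excel_sheets, merge_sentiment, merge_general_topics, in_data_files, chosen_cols,
        output_folder, candidate_topics, azure_endpoint, OUTPUT_DEBUG_FILES, api_url,
        aws_access_key_textbox, aws_secret_key_textbox, aws_region_textbox,
        azure_api_key_textbox, sentiment_checkbox=sentiment_checkbox,
    )
```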
@@ -2511,16 +2465,6 @@ def deduplicate_topics_llm_wrapper(
         f"Attempting to download from bucket: {S3_LOG_BUCKET}, key: {S3_COST_CODES_PATH}"
     )
 
-    # Create a wrapper function with error handling
-    def download_cost_codes_with_error_handling(bucket, key, local_path):
-        try:
-            download_file_from_s3(bucket, key, local_path)
-            return True
-        except Exception as e:
-            print(f"Error downloading cost codes from S3: {e}")
-            print(f"Failed to download s3://{bucket}/{key}")
-            return False
-
     app.load(
         download_cost_codes_with_error_handling,
         inputs=[
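
Similarly, the error-handling download helper removed from app.py is now imported from tools.aws_functions (see the first hunk of this file), where download_file_from_s3 is already defined. A hedged reconstruction of the relocated function, copied from the deleted lines above; the module's real version may differ:

```python
# Hedged reconstruction of the helper as it would sit in tools/aws_functions.py,
# alongside download_file_from_s3 (already defined in that module), taken from
# the code deleted above.
def download_cost_codes_with_error_handling(bucket, key, local_path) -> bool:
    """Download the cost codes file from S3, returning False instead of raising on failure."""
    try:
        download_file_from_s3(bucket, key, local_path)
        return True
    except Exception as e:
        print(f"Error downloading cost codes from S3: {e}")
        print(f"Failed to download s3://{bucket}/{key}")
        return False
```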

cli_topics.py

Lines changed: 6 additions & 6 deletions
@@ -37,7 +37,7 @@
     LLM_SEED,
     LLM_TEMPERATURE,
     MAX_TIME_FOR_LOOP,
-    MAXIMUM_ZERO_SHOT_TOPICS,
+    MAXIMUM_ALLOWED_TOPICS,
     OUTPUT_DEBUG_FILES,
     OUTPUT_FOLDER,
     RUN_AWS_FUNCTIONS,
@@ -833,10 +833,10 @@ def main(direct_mode_args={}):
         help=f"Enable deduplication after each batch during topic extraction (True/False). Default: {ENABLE_BATCH_DEDUPLICATION}",
     )
     extract_group.add_argument(
-        "--maximum_zero_shot_topics",
+        "--maximum_allowed_topics",
         type=int,
-        default=MAXIMUM_ZERO_SHOT_TOPICS,
-        help=f"Maximum number of topics before triggering LLM-based deduplication. Default: {MAXIMUM_ZERO_SHOT_TOPICS}",
+        default=MAXIMUM_ALLOWED_TOPICS,
+        help=f"Maximum number of topics before triggering LLM-based deduplication. Default: {MAXIMUM_ALLOWED_TOPICS}",
     )
 
     # --- Validation Arguments ---
@@ -1061,8 +1061,8 @@ def main(direct_mode_args={}):
 
     if hasattr(args, "enable_batch_deduplication"):
         config_module.ENABLE_BATCH_DEDUPLICATION = args.enable_batch_deduplication
-    if hasattr(args, "maximum_zero_shot_topics"):
-        config_module.MAXIMUM_ZERO_SHOT_TOPICS = args.maximum_zero_shot_topics
+    if hasattr(args, "maximum_allowed_topics"):
+        config_module.MAXIMUM_ALLOWED_TOPICS = args.maximum_allowed_topics
 
     # Get username and folders
     (
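
The renamed --maximum_allowed_topics flag follows the same pattern as the other CLI options: the argparse default is read from the config module, and after parsing, the value is written back onto it so the rest of the pipeline sees the override. A self-contained sketch of that pattern with a stand-in config object; demo_config is not the repository's real tools.config, and the default value shown is illustrative only:

```python
# Standalone sketch of the config-backed argparse pattern used in cli_topics.py:
# defaults come from a config object and parsed values are written back onto it.
# demo_config stands in for tools.config; the default of 120 is illustrative only.
import argparse
import types

demo_config = types.SimpleNamespace(MAXIMUM_ALLOWED_TOPICS=120)

parser = argparse.ArgumentParser()
parser.add_argument(
    "--maximum_allowed_topics",
    type=int,
    default=demo_config.MAXIMUM_ALLOWED_TOPICS,
    help=f"Maximum number of topics before triggering LLM-based deduplication. Default: {demo_config.MAXIMUM_ALLOWED_TOPICS}",
)
args = parser.parse_args(["--maximum_allowed_topics", "60"])

# Mirror the hasattr-guarded write-back shown in the hunk above
if hasattr(args, "maximum_allowed_topics"):
    demo_config.MAXIMUM_ALLOWED_TOPICS = args.maximum_allowed_topics

print(demo_config.MAXIMUM_ALLOWED_TOPICS)  # prints 60
```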

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "llm_topic_modelling"
-version = "0.8.0"
+version = "0.9.0"
 description = "Generate thematic summaries from open text in tabular data files with a large language model."
 requires-python = ">=3.10"
 readme = "README.md"
@@ -51,7 +51,7 @@ classifiers = [
 ]
 
 dependencies = [
-    "gradio==6.2.0",
+    "gradio==6.0.2",
     "transformers==4.57.2",
     "spaces==0.42.1",
     "boto3==1.42.1",

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # Note that this requirements file is optimised for Hugging Face spaces / Python 3.10. Please use requirements_no_local.txt for installation without local model inference (simplest approach to get going). Please use requirements_cpu.txt for CPU instances and requirements_gpu.txt for GPU instances using Python 3.11
-gradio==6.2.0
+gradio==6.0.2
 transformers==4.57.2
 spaces==0.42.1
 boto3>=1.42.1

requirements_cpu.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-gradio==6.2.0
+gradio==6.0.2
 transformers==4.57.2
 spaces==0.42.1
 pandas>=2.3.3

requirements_gpu.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-gradio==6.2.0
+gradio==6.0.2
 transformers==4.57.2
 spaces==0.42.1
 boto3>=1.42.1

requirements_lightweight.txt

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 # This requirements file is optimised for AWS ECS using Python 3.11 alongside the Dockerfile, without local torch and llama-cpp-python. For AWS ECS, torch and llama-cpp-python are optionally installed in the main Dockerfile
-gradio==6.2.0
+gradio==6.0.2
 transformers==4.57.2
 spaces==0.42.1
 boto3>=1.42.1

test/mock_inference_server.py

Lines changed: 5 additions & 5 deletions
@@ -32,11 +32,11 @@ def _generate_mock_response(self, prompt: str, system_prompt: str) -> str:
         """
         # Generate a simple markdown table that satisfies the validation
         # This mimics a topic extraction table response
-        mock_table = """| Reference | General Topic | Sub-topic | Sentiment |
-|-----------|---------------|-----------|-----------|
-| 1 | Test Topic | Test Subtopic | Positive |
-| 2 | Another Topic | Another Subtopic | Neutral |
-| 3 | Third Topic | Third Subtopic | Negative |
+        mock_table = """| General topic | Subtopic | Sentiment | Response References | Summary |
+|-----------|---------------|-----------|-----------|-----------|
+| Test Topic | Test Subtopic | Positive | 1 | Test summary |
+| Another Topic | Another Subtopic | Neutral | 2,3 | Another summary |
+| Third Topic | Third Subtopic | Negative | 1, 2, 3 | Third summary |
 
 This is a mock response from the test inference server. The actual content would be generated by a real LLM model, but for testing purposes, this dummy response allows us to verify that the CLI commands work correctly without incurring API costs."""
 
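The mock table now carries the columns the extraction pipeline expects (General topic, Subtopic, Sentiment, Response References, Summary), so CLI tests exercise a realistic shape. A hedged helper for turning such a markdown pipe table into a DataFrame for a test assertion; this is illustrative only, not the repository's parsing code, and it assumes a well-formed table:

```python
# Illustrative parser for the mock markdown table above, e.g. for test assertions.
# Not the repository's actual parsing code; assumes a well-formed pipe table.
import pandas as pd


def parse_markdown_table(table: str) -> pd.DataFrame:
    lines = [ln.strip() for ln in table.strip().splitlines() if ln.strip().startswith("|")]
    header = [cell.strip() for cell in lines[0].strip("|").split("|")]
    rows = [
        [cell.strip() for cell in ln.strip("|").split("|")]
        for ln in lines[2:]  # skip the |---| separator row
    ]
    df = pd.DataFrame(rows, columns=header)
    # "Response References" holds comma-separated response numbers such as "1, 2, 3"
    df["Response References"] = df["Response References"].apply(
        lambda refs: [int(r) for r in refs.split(",")]
    )
    return df
```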