Skip to content

Commit 7eaf3e4

Browse files
Merge pull request #63 from seanpedrick-case/dev
Integration of LLM deduplication into the all-in-one pipeline (optional). Can now output only the xlsx file in direct mode. Improved Excel file output formatting.
2 parents 87c25fe + dc71beb commit 7eaf3e4

File tree

8 files changed

+452
-120
lines changed

8 files changed

+452
-120
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ short_description: Create thematic summaries for open text data with LLMs
1313

1414
# Large language model topic modelling
1515

16-
Version: 0.6.0
16+
Version: 0.7.0
1717

1818
Extract topics and summarise outputs using Large Language Models (LLMs: Gemma 3 4b/GPT-OSS 20b if local (see tools/config.py to modify), Gemini, Azure, or AWS Bedrock models, e.g. Claude or Nova models). The app will query the LLM with batches of responses to produce summary tables, which are then compared iteratively to output a table with the general topics, subtopics, topic sentiment, and a topic summary. Instructions on use can be found in the README.md file. You can try out examples by clicking on one of the example datasets on the main app page, which will show you example outputs from a local model run. API keys for AWS, Azure, and Gemini services can be entered on the settings page (note that Gemini has a free public API).
1919

app.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1853,6 +1853,10 @@ def deduplicate_topics_llm_wrapper(
18531853
candidate_topics=None,
18541854
azure_endpoint="",
18551855
api_url=None,
1856+
aws_access_key_textbox="",
1857+
aws_secret_key_textbox="",
1858+
aws_region_textbox="",
1859+
azure_api_key_textbox="",
18561860
):
18571861
# Ensure custom model_choice is registered in model_name_map
18581862
ensure_model_in_map(model_choice)
@@ -1880,6 +1884,10 @@ def deduplicate_topics_llm_wrapper(
18801884
azure_endpoint,
18811885
OUTPUT_DEBUG_FILES,
18821886
api_url,
1887+
aws_access_key_textbox,
1888+
aws_secret_key_textbox,
1889+
aws_region_textbox,
1890+
azure_api_key_textbox,
18831891
)
18841892

18851893
deduplicate_llm_previous_data_btn.click(
@@ -1912,6 +1920,10 @@ def deduplicate_topics_llm_wrapper(
19121920
candidate_topics,
19131921
azure_endpoint_textbox,
19141922
api_url_textbox,
1923+
aws_access_key_textbox,
1924+
aws_secret_key_textbox,
1925+
aws_region_textbox,
1926+
azure_api_key_textbox,
19151927
],
19161928
outputs=[
19171929
master_reference_df_state,

cli_topics.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
DEDUPLICATION_THRESHOLD,
2626
DEFAULT_COST_CODE,
2727
DEFAULT_SAMPLED_SUMMARIES,
28+
DIRECT_MODE_S3_UPLOAD_ONLY_XLSX,
2829
DYNAMODB_USAGE_LOG_HEADERS,
2930
GEMINI_API_KEY,
3031
GRADIO_TEMP_DIR,
@@ -1393,7 +1394,6 @@ def main(direct_mode_args={}):
13931394
in_api_key=args.google_api_key,
13941395
temperature=args.temperature,
13951396
model_source=model_source,
1396-
bedrock_runtime=None,
13971397
local_model=None,
13981398
tokenizer=None,
13991399
assistant_model=None,
@@ -1409,6 +1409,19 @@ def main(direct_mode_args={}):
14091409
azure_endpoint=args.azure_endpoint,
14101410
output_debug_files=str(args.output_debug_files),
14111411
api_url=args.api_url if args.api_url else API_URL,
1412+
aws_access_key_textbox=(
1413+
args.aws_access_key if hasattr(args, "aws_access_key") else ""
1414+
),
1415+
aws_secret_key_textbox=(
1416+
args.aws_secret_key if hasattr(args, "aws_secret_key") else ""
1417+
),
1418+
aws_region_textbox=(
1419+
args.aws_region if hasattr(args, "aws_region") else ""
1420+
),
1421+
azure_api_key_textbox=(
1422+
args.azure_api_key if hasattr(args, "azure_api_key") else ""
1423+
),
1424+
model_name_map=model_name_map,
14121425
)
14131426

14141427
end_time = time.time()
@@ -1944,15 +1957,15 @@ def main(direct_mode_args={}):
19441957

19451958
# Upload outputs to S3 if enabled
19461959
# Collect all output files from the pipeline
1947-
all_output_files = []
1948-
if topic_extraction_output_files:
1949-
all_output_files.extend(topic_extraction_output_files)
1950-
if overall_summary_output_files:
1951-
all_output_files.extend(overall_summary_output_files)
1960+
all_s3_output_files = []
1961+
if topic_extraction_output_files and not DIRECT_MODE_S3_UPLOAD_ONLY_XLSX:
1962+
all_s3_output_files.extend(topic_extraction_output_files)
1963+
if overall_summary_output_files and not DIRECT_MODE_S3_UPLOAD_ONLY_XLSX:
1964+
all_s3_output_files.extend(overall_summary_output_files)
19521965
if xlsx_files:
1953-
all_output_files.extend(xlsx_files)
1966+
all_s3_output_files.extend(xlsx_files)
19541967
upload_outputs_to_s3_if_enabled(
1955-
output_files=all_output_files,
1968+
output_files=all_s3_output_files,
19561969
base_file_name=file_name,
19571970
session_hash=session_hash,
19581971
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "llm_topic_modelling"
7-
version = "0.6.0"
7+
version = "0.7.0"
88
description = "Generate thematic summaries from open text in tabular data files with a large language model."
99
requires-python = ">=3.10"
1010
readme = "README.md"

0 commit comments

Comments
 (0)