diff --git a/ai/generative-ai-service/sentiment+categorization/LICENSE b/ai/generative-ai-service/sentiment+categorization/LICENSE new file mode 100644 index 000000000..8dc7c0703 --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/LICENSE @@ -0,0 +1,35 @@ +Copyright (c) 2025 Oracle and/or its affiliates. + +The Universal Permissive License (UPL), Version 1.0 + +Subject to the condition set forth below, permission is hereby granted to any +person obtaining a copy of this software, associated documentation and/or data +(collectively the "Software"), free of charge and under any and all copyright +rights in the Software, and any and all patent rights owned or freely +licensable by each licensor hereunder covering either (i) the unmodified +Software as contributed to or provided by such licensor, or (ii) the Larger +Works (as defined below), to deal in both + +(a) the Software, and +(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +one is included with the Software (each a "Larger Work" to which the Software +is contributed by such licensors), + +without restriction, including without limitation the rights to copy, create +derivative works of, display, perform, and distribute the Software and make, +use, sell, offer for sale, import, export, have made, and have sold the +Software and the Larger Work(s), and to sublicense the foregoing rights on +either these or other terms. + +This license is subject to the following condition: +The above copyright notice and either this complete permission notice or at +a minimum a reference to the UPL must be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/ai/generative-ai-service/sentiment+categorization/README.md b/ai/generative-ai-service/sentiment+categorization/README.md new file mode 100644 index 000000000..6a1f8351e --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/README.md @@ -0,0 +1,32 @@ +# Customer Message Analyzer + +The Customer Message Analyzer is a tool designed to analyze customer messages through unsupervised categorization, sentiment analysis, and summary reporting. It helps businesses understand customer feedback without requiring extensive manual labeling or analysis. + + +Reviewed: 01.04.2025 + +# When to use this asset? + +Customer service teams, product managers, and marketing professionals would use this asset when they need to quickly understand large volumes of customer feedback, identify trends, and make data-driven decisions to improve products or services. + +# How to use this asset? + +To use the Customer Message Analyzer, follow these steps: + +1. Input the customer messages into the system. +2. The system will automatically cluster the messages into categories based on their content. +3. Each message will receive a sentiment score indicating its emotional tone. +4. Review the generated summary report highlighting dominant themes, sentiment trends, and actionable insights. 
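+
+As an illustration, a single analyzed message might come back looking like the record below (a hypothetical example; the field names follow the prompts bundled with this asset, while the values are invented):
+
+```python
+analyzed_message = {
+    "id": "5",
+    "topic": "Damaged delivery",
+    "summary": "The customer received a product that was damaged in transit.",
+    "sentiment_score": 2,                     # 1 = very negative, 10 = very positive
+    "primary_category": "Order Fulfillment",  # added by the categorization step
+    "secondary_category": "Shipping",
+    "tertiary_category": "Damaged Goods",
+}
+```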
+
+# Useful Links
+
+- [Confluence](https://confluence.oraclecorp.com/confluence/x/DaCEoAE)
+    - Internal Reusable Assets
+
+# License
+
+Copyright (c) 2025 Oracle and/or its affiliates.
+
+Licensed under the Universal Permissive License (UPL), Version 1.0.
+
+See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.
diff --git a/ai/generative-ai-service/sentiment+categorization/files/README.md b/ai/generative-ai-service/sentiment+categorization/files/README.md
new file mode 100644
index 000000000..37df10d65
--- /dev/null
+++ b/ai/generative-ai-service/sentiment+categorization/files/README.md
@@ -0,0 +1,54 @@
+# Batch Message Analysis and Categorization Demo
+This demo showcases an AI-powered solution for analyzing batches of customer messages, categorizing them into hierarchical levels, extracting sentiment scores, and generating structured reports.
+
+## Key Features
+* **Hierarchical Categorization**: Automatically categorizes messages into three levels of hierarchy:
+    + Primary Category: High-level categorization
+    + Secondary Category: Mid-level categorization, building upon primary categories
+    + Tertiary Category: Low-level categorization, providing increased specificity and detail
+* **Sentiment Analysis**: Extracts a sentiment score for each message, ranging from very negative (1) to very positive (10)
+* **Structured Reporting**: Generates a comprehensive report analyzing the batch of messages, including:
+    + Category distribution across all three levels
+    + Sentiment score distribution
+    + Summaries of key findings and insights
+
+## Data Requirements
+* Customer messages should be stored in one or more CSV files inside the `backend/data` folder.
+* Each CSV file should contain an `ID` column and a `Message` column holding the message text.
+
+## Getting Started
+To run the demo, follow these steps:
+1. Clone the repository using `git clone`.
+2. Place your CSV files containing customer messages in the `backend/data` folder.
+3. Install dependencies using `pip install -r requirements.txt`.
+4. Run the application using `streamlit run app.py`.
+
+## Example Use Cases
+* Analyze customer feedback from surveys, reviews, or social media platforms to identify trends and patterns.
+* Inform product development and customer support strategies by understanding customer sentiment and preferences.
+* Optimize marketing campaigns by targeting specific customer segments based on their interests and concerns.
+
+## Technical Details
+* The solution leverages Oracle Cloud Infrastructure (OCI) Generative AI, a fully managed service that provides access to large language models.
+* Specifically, this demo uses the Cohere Command R+ model, a state-of-the-art language model optimized for natural language processing tasks.
+* All of the demo's core steps are powered by OCI Generative AI:
+    + Hierarchical categorization
+    + Sentiment analysis
+    + Structured report generation
+
+## Output
+The demo displays an interactive dashboard with the generated report, providing insights into the customer messages, including:
+* Category distribution across all three levels
+* Sentiment score distribution
+* Summaries of key findings and insights
+
+## Contributing
+We welcome contributions to improve and expand the capabilities of this demo. Please fork the repository and submit a pull request with your changes.
+
+## License
+Copyright (c) 2025 Oracle and/or its affiliates.
+
+Licensed under the Universal Permissive License (UPL), Version 1.0.
+ +See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details. \ No newline at end of file diff --git a/ai/generative-ai-service/sentiment+categorization/files/app.py b/ai/generative-ai-service/sentiment+categorization/files/app.py new file mode 100644 index 000000000..41146f11e --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/app.py @@ -0,0 +1,16 @@ +import streamlit as st + +st.set_page_config( + page_title="Hello", + page_icon="πŸ‘‹", +) + +st.write("# Welcome to Streamlit! πŸ‘‹") + +st.sidebar.success("Select a demo above.") + +st.markdown( + """ +This is a demo! +""" +) \ No newline at end of file diff --git a/ai/generative-ai-service/sentiment+categorization/files/backend/data/complaints_messages.csv b/ai/generative-ai-service/sentiment+categorization/files/backend/data/complaints_messages.csv new file mode 100644 index 000000000..b1e97be02 --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/backend/data/complaints_messages.csv @@ -0,0 +1,31 @@ +ID,Message +1,I had to cancel my order because of poor service. +2,"The delivery was late, and the packaging was damaged." +3,I was sent the wrong color of the product. +4,My order was incomplete when it arrived. +5,The product I received was damaged. +6,The quality of the product is much worse than expected. +7,The product stopped working after a short period of time. +8,The product doesn’t match the description on the website. +9,I’ve had to contact customer service multiple times for the same issue. +10,Customer support was not helpful at all. +11,The quality of the product was poor. +12,The product was much smaller than I expected. +13,I had trouble finding the product on your website. +14,The instructions were unclear and hard to follow. +15,The website was difficult to navigate during my purchase. +16,I received the wrong size and need a replacement. +17,I was given false information about the product. +18,The product stopped working after a short period of time. +19,The product arrived damaged and unusable. +20,The product arrived in terrible condition. +21,The product arrived damaged and unusable. +22,The customer service was slow to respond. +23,The product was missing some essential accessories. +24,I didn’t receive any confirmation email for my order. +25,The product wasn’t compatible with my other appliances. +26,The product is faulty and doesn’t work properly. +27,The product didn’t fit as expected. +28,The product was extremely hard to set up. +29,I am unhappy with the design of the product. +30,The website was difficult to navigate during my purchase. 
\ No newline at end of file
diff --git a/ai/generative-ai-service/sentiment+categorization/files/backend/feedback_agent.py b/ai/generative-ai-service/sentiment+categorization/files/backend/feedback_agent.py
new file mode 100644
index 000000000..837510380
--- /dev/null
+++ b/ai/generative-ai-service/sentiment+categorization/files/backend/feedback_agent.py
@@ -0,0 +1,129 @@
+import json
+import logging
+from typing import List
+
+from langchain_community.chat_models.oci_generative_ai import ChatOCIGenAI
+from langchain_core.messages import HumanMessage, SystemMessage
+from langchain_core.pydantic_v1 import BaseModel
+from langgraph.checkpoint.memory import MemorySaver
+from langgraph.graph import END, StateGraph
+
+import backend.message_handler as handler
+import backend.utils.llm_config as llm_config
+
+# Set up logging
+logging.getLogger("oci").setLevel(logging.DEBUG)
+# CSV shipped with this demo; the path is relative to the files/ directory from
+# which `streamlit run app.py` is launched.
+messages_path = "backend/data/complaints_messages.csv"
+
+
+class AgentState(BaseModel):
+    messages_info: List = []
+    categories: List = []
+    reports: List = []
+
+
+class FeedbackAgent:
+    def __init__(self, model_name: str = "cohere_oci"):
+        self.model_name = model_name
+        self.model = self.initialize_model()
+        self.memory = MemorySaver()
+        self.builder = self.setup_graph()
+        self.messages = self.read_messages()
+
+    def initialize_model(self):
+        if self.model_name not in llm_config.MODEL_REGISTRY:
+            raise ValueError(f"Unknown model: {self.model_name}")
+
+        model_config = llm_config.MODEL_REGISTRY[self.model_name]
+
+        return ChatOCIGenAI(
+            model_id=model_config["model_id"],
+            service_endpoint=model_config["service_endpoint"],
+            compartment_id=model_config["compartment_id"],
+            provider=model_config["provider"],
+            auth_type=model_config["auth_type"],
+            auth_profile=model_config["auth_profile"],
+            model_kwargs=model_config["model_kwargs"],
+        )
+
+    def read_messages(self):
+        messages = handler.read_messages(filepath=messages_path)
+        return handler.batchify(messages, 30)
+
+    def summarization_node(self, state: AgentState):
+        batch = self.messages
+        response = self.model.invoke(
+            [
+                SystemMessage(
+                    content=llm_config.get_prompt(self.model_name, "SUMMARIZATION")
+                ),
+                HumanMessage(content=f"Message batch: {batch}"),
+            ]
+        )
+        state.messages_info = state.messages_info + [json.loads(response.content)]
+        return {"messages_info": state.messages_info}
+
+    def categorization_node(self, state: AgentState):
+        batch = state.messages_info
+        response = self.model.invoke(
+            [
+                SystemMessage(
+                    content=llm_config.get_prompt(
+                        self.model_name, "CATEGORIZATION_SYSTEM"
+                    )
+                ),
+                HumanMessage(
+                    content=llm_config.get_prompt(
+                        self.model_name, "CATEGORIZATION_USER"
+                    ).format(MESSAGE_BATCH=batch)
+                ),
+            ]
+        )
+        content = [json.loads(response.content)]
+        state.categories = state.categories + handler.match_categories(batch, content)
+        return {"categories": state.categories}
+
+    def generate_report_node(self, state: AgentState):
+        response = self.model.invoke(
+            [
+                SystemMessage(
+                    content=llm_config.get_prompt(self.model_name, "REPORT_GEN")
+                ),
+                HumanMessage(content=f"Message info: {state.categories}"),
+            ]
+        )
+        state.reports = response.content
+        return {"reports": [response.content]}
+
+    def setup_graph(self):
+        builder = StateGraph(AgentState)
+        builder.add_node("summarize", self.summarization_node)
+        builder.add_node("categorize", self.categorization_node)
+        builder.add_node("generate_report", self.generate_report_node)
+
+        builder.set_entry_point("summarize")
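+
+        # Wire the nodes into a fixed, linear pipeline:
+        # summarize -> categorize -> generate_report -> END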
builder.add_edge("summarize", "categorize") + builder.add_edge("categorize", "generate_report") + + builder.add_edge("generate_report", END) + return builder.compile(checkpointer=self.memory) + + def get_graph(self): + return self.builder.get_graph() + + def run(self): + thread = {"configurable": {"thread_id": "1"}} + for s in self.builder.stream( + config=thread, + ): + print(f"\n \n{s}") + + def run_step_by_step(self): + thread = {"configurable": {"thread_id": "1"}} + initial_state = { + "messages_info": [], + "categories": [], + "reports": [], + } + for state in self.builder.stream(initial_state, thread): + yield state # Yield each intermediate step to allow step-by-step execution diff --git a/ai/generative-ai-service/sentiment+categorization/files/backend/feedback_wrapper.py b/ai/generative-ai-service/sentiment+categorization/files/backend/feedback_wrapper.py new file mode 100644 index 000000000..939e89ec3 --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/backend/feedback_wrapper.py @@ -0,0 +1,25 @@ +from backend.feedback_agent import FeedbackAgent + + +class FeedbackAgentWrapper: + def __init__(self): + self.agent = FeedbackAgent() + self.run_graph = self.agent.run_step_by_step() + + def get_nodes_edges(self): + graph_data = self.agent.get_graph() + nodes = list(graph_data.nodes.keys()) + edges = [(edge.source, edge.target) for edge in graph_data.edges] + return nodes, edges + + def run_step_by_step(self): + try: + action_output = next(self.run_graph) + current_node = list(action_output.keys())[0] + except StopIteration: + action_output = {} + current_node = "FINALIZED" + return current_node, action_output + + def get_graph(self): + return self.agent.get_graph() diff --git a/ai/generative-ai-service/sentiment+categorization/files/backend/message_handler.py b/ai/generative-ai-service/sentiment+categorization/files/backend/message_handler.py new file mode 100644 index 000000000..7109468ea --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/backend/message_handler.py @@ -0,0 +1,53 @@ +import csv +from typing import List + + +def read_messages( + filepath: str, columns: List[str] = ["ID", "Message"] +) -> List[List[str]]: + with open(filepath, newline="", encoding="utf-8") as file: + reader = csv.DictReader(file) + extracted_data = [] + + for row in reader: + extracted_row = [row[col] for col in columns if col in row] + extracted_data.append(extracted_row) + + return extracted_data + + +def batchify(lst, batch_size): + return [lst[i : i + batch_size] for i in range(0, len(lst), batch_size)] + + +def match_categories(summaries, categories): + result = [] + for i, elem in enumerate(summaries[0]): + if elem["id"] == categories[0][i]["id"]: + elem["primary_category"] = categories[0][i]["primary_category"] + elem["secondary_category"] = categories[0][i]["secondary_category"] + elem["tertiary_category"] = categories[0][i]["tertiary_category"] + result.append(elem) + return result + + +def group_by_category_level(categories_list): + result = {} + + for category in categories_list: + primary = category["primary_category"] + secondary = category["secondary_category"] + tertiary = category["tertiary_category"] + + if primary not in result: + result[primary] = {} + + if secondary not in result[primary]: + result[primary][secondary] = {} + + if tertiary not in result[primary][secondary]: + result[primary][secondary][tertiary] = [] + + result[primary][secondary][tertiary].append(category["id"]) + + return result diff --git 
a/ai/generative-ai-service/sentiment+categorization/files/backend/utils/config.py b/ai/generative-ai-service/sentiment+categorization/files/backend/utils/config.py new file mode 100644 index 000000000..8642619d0 --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/backend/utils/config.py @@ -0,0 +1,15 @@ + +# Common Configuration +COMPARTMENT_ID = "ocid1.compartment.oc1..XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX" # add here your compartment ocid id +DIRECTORY = "data" +AUTH_TYPE = "API_KEY" # add here your authentication type +CONFIG_PROFILE = "DEFAULT" +ENDPOINT = "" # add here your generative ai endpoint + +# COHERE data +PROVIDER_COHERE = "cohere" +GENERATE_MODEL_COHERE = "cohere.command-r-plus-08-2024" # add here your model ocid id +# LLAMA data +PROVIDER_LLAMA = "meta" +GENERATE_MODEL_LLAMA_33= "meta.llama-3.3-70b-instruct" # add here your model ocid id + diff --git a/ai/generative-ai-service/sentiment+categorization/files/backend/utils/llm_config.py b/ai/generative-ai-service/sentiment+categorization/files/backend/utils/llm_config.py new file mode 100644 index 000000000..1d8939ab3 --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/backend/utils/llm_config.py @@ -0,0 +1,49 @@ +# import utils.config as config +# from utils import prompts as prompts + +import backend.utils.config as config +from backend.utils import prompts as prompts + + +def get_prompt(model_name: str, prompt_type: str) -> str: + if model_name not in PROMPT_SETS: + raise ValueError(f"No prompts defined for model {model_name}") + if prompt_type not in PROMPT_SETS[model_name]: + raise ValueError(f"Unknown prompt type: {prompt_type}") + return PROMPT_SETS[model_name][prompt_type] + + +MODEL_REGISTRY = { + "cohere_oci": { + "model_id": config.GENERATE_MODEL_COHERE, + "service_endpoint": config.ENDPOINT, + "compartment_id": config.COMPARTMENT_ID, + "provider": config.PROVIDER_COHERE, + "auth_type": config.AUTH_TYPE, + "auth_profile": config.CONFIG_PROFILE, + "model_kwargs": {"temperature": 0, "max_tokens": 4000}, + }, + "meta_oci": { + "model_id": config.GENERATE_MODEL_LLAMA_33, + "service_endpoint": config.ENDPOINT, + "compartment_id": config.COMPARTMENT_ID, + "provider": config.PROVIDER_LLAMA, + "auth_type": config.AUTH_TYPE, + "auth_profile": config.CONFIG_PROFILE, + "model_kwargs": {"temperature": 0, "max_tokens": 2000}, + }, +} + +PROMPT_SETS = { + "cohere_oci": { + "SUMMARIZATION": prompts.SUMMARIZATION, + "CATEGORIZATION_SYSTEM": prompts.CATEGORIZATION_SYSTEM, + "CATEGORIZATION_USER": prompts.CATEGORIZATION_USER, + "REPORT_GEN": prompts.REPORT_GEN, + }, + "meta_oci": { + "SUMMARIZATION_LLAMA": prompts.SUMMARIZATION_LLAMA, + "CATEGORIZATION_LLAMA": prompts.CATEGORIZATION_LLAMA, + "REPORT_GEN_LLAMA": prompts.REPORT_GEN_LLAMA, + }, +} diff --git a/ai/generative-ai-service/sentiment+categorization/files/backend/utils/prompts.py b/ai/generative-ai-service/sentiment+categorization/files/backend/utils/prompts.py new file mode 100644 index 000000000..0810917b3 --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/backend/utils/prompts.py @@ -0,0 +1,273 @@ + +SUMMARIZATION = """\ +Analyze the following batch of user messages. For each message, identify the main topic, provide a concise summary, and assess the sentiment on a scale from 1 (negative) to 10 (positive). Return your analysis as an array of JSON objects, with each object containing the message ID and the following fields: "topic", "summary", and "sentiment_score". 
+
+**Provide your analysis in this exact JSON format:**
+[
+    {
+        "id": "msg_001",
+        "topic": "The primary subject matter of the first message",
+        "summary": "A concise 1-2 sentence summary of the key points",
+        "sentiment_score": X
+    },
+    {
+        "id": "msg_002",
+        "topic": "The primary subject matter of the second message",
+        "summary": "A concise 1-2 sentence summary of the key points",
+        "sentiment_score": X
+    },
+    {
+        "id": "msg_003",
+        "topic": "The primary subject matter of the third message",
+        "summary": "A concise 1-2 sentence summary of the key points",
+        "sentiment_score": X
+    }
+]
+
+**Before finalizing your response:**
+1. Ensure you've accurately identified the central topic for each message
+2. Verify your summaries capture the essential information in 1-2 sentences
+3. Confirm your sentiment scores reflect the emotional tone (1=extremely negative, 10=extremely positive)
+4. Check that your output is valid, parseable JSON
+5. Make sure each message ID in your response matches the corresponding message ID from the input
+\
+"""
+
+CATEGORIZATION_SYSTEM = """\
+You are an expert content analyzer that categorizes user messages into a hierarchical taxonomy. You process batches of message summaries, each with a unique ID, and classify them into a 3-level nested category structure. Your categorization must follow a consistent taxonomy with increasing specificity at each level.
+
+IMPORTANT CONSTRAINTS:
+- Use no more than 5 primary categories
+- Use no more than 5-6 secondary categories per primary category
+- Use no more than 10 tertiary categories TOTAL across the entire taxonomy
+- Maintain consistent naming and logical hierarchical relationships
+- Reuse existing categories instead of creating new ones whenever possible
+
+Analyze the following batch of message summaries and categorize each one into this constrained 3-level hierarchical taxonomy:
+
+1. Primary category: Broadest level (e.g., "Human Resources", "Finance", "IT")
+2. Secondary category: More specific domain within the primary category (e.g., "Policies", "Benefits", "Recruitment")
+3. Tertiary category: Most specific classification (e.g., "Leaves", "Health Insurance", "Interview Process")
+
+Ensure that each level builds upon the previous level with increasing specificity and follows logical parent-child relationships.
+
+Return your analysis as a JSON array with each element containing "id", "primary_category", "secondary_category", and "tertiary_category".
+
+**Example of expected categorization structure:**
+- Primary: "Human Resources"
+  - Secondary: "Policies"
+    - Tertiary: "Leaves"
+  - Secondary: "Benefits"
+    - Tertiary: "Health Insurance"
+
+\
+"""
+
+CATEGORIZATION_USER = """\
+
+Message Batch:
+{MESSAGE_BATCH}
+\
+"""
+
+REPORT_GEN = """\
+# **Generate a Categorized JSON Report of Messages**
+
+## **Task**
+You are an expert data analyst. Your task is to generate a structured and insightful JSON report analyzing a batch of messages based on their category levels, summaries, and sentiment scores. Each message is classified into three hierarchical category levels (Category Level 1, Category Level 2, and Category Level 3).
+
+## **Instructions**
+Follow these steps to generate the report:
+1. **Group messages by Category Level 1**, then further organize them into Category Level 2 subcategories.
+2. **Summarize each category level** by aggregating key themes and trends from message summaries.
+3. 
**Analyze sentiment scores** by calculating the average, highest, and lowest sentiment scores for each category level (1=extremely negative, 10=extremely positive). +4. **Identify key insights** such as common concerns, prevailing sentiment trends, and outliers. +5. **Format the output as a structured JSON object** with the following structure: + +## **Output Format (JSON)** +{ + "categories": [ + { + "category_level_1": "Category Name", + "summary": "Concise summary of messages in this category", + "average_sentiment_score": 7.2, + "highest_sentiment_message": { + "summary": "Message summary", + "sentiment_score": 10 + }, + "lowest_sentiment_message": { + "summary": "Message summary", + "sentiment_score": 2 + }, + "key_insights": [ + "Notable pattern 1", + "Notable pattern 2" + ], + "subcategories": [ + { + "category_level_2": "Subcategory Name", + "summary": "Common themes from messages", + "average_sentiment_score": 6.5, + "sentiment_range": { + "lowest": 3, + "highest": 9 + }, + "notable_sentiment_trends": [ + "Trend 1", + "Trend 2" + ], + } + ] + } + ] +} +\ +""" + + + +# (LLAMA format) COMPETITOR REPORT PROMPTS +SUMMARIZATION_LLAMA = """ +You are an AI assistant that analyzes text and provides structured analysis. Your task is to examine the user's message, identify the main topic, create a concise summary, and evaluate the sentiment on a scale from 1 (negative) to 10 (positive). You must return your analysis in valid JSON format. + + + +Analyze the following message and provide your analysis in valid JSON format with these fields: "main_topic", "summary", and "sentiment_score" (where 1 is extremely negative and 10 is extremely positive). + +Message: {USER_MESSAGE} + + + +I'll analyze this message carefully. + +{ + "main_topic": "The primary subject matter of the message", + "summary": "A concise 1-2 sentence summary of the key points", + "sentiment_score": X +} + +Let me verify: +- The main topic accurately represents the central theme +- The summary captures the essential information in 1-2 sentences +- The sentiment score correctly reflects the emotional tone on a 1-10 scale +- The output is valid, parseable JSON + +""" + +CATEGORIZATION_LLAMA = """ +You are an expert content analyzer that categorizes user messages into a hierarchical taxonomy. You process batches of message summaries, each with a unique ID, and classify them into a 3-level nested category structure. Your categorization must follow a consistent taxonomy with increasing specificity at each level. + + + +Analyze the following batch of message summaries and categorize each one into a 3-level hierarchical taxonomy: + +1. Primary category: Broadest level (e.g., "Human Resources", "Finance", "IT") +2. Secondary category: More specific domain within the primary category (e.g., "Policies", "Benefits", "Recruitment") +3. Tertiary category: Most specific classification (e.g., "Leaves", "Health Insurance", "Interview Process") + +Ensure that each level builds upon the previous level with increasing specificity and follows logical parent-child relationships. + +Return your analysis as a JSON array with each element containing "id", "primary_category", "secondary_category", and "tertiary_category". 
+ +Example of expected categorization structure: +- Primary: "Human Resources" + - Secondary: "Policies" + - Tertiary: "Leaves" + - Secondary: "Benefits" + - Tertiary: "Health Insurance" + +Message Batch: +{MESSAGE_BATCH} + + + +I've analyzed each message summary and categorized them according to the 3-level hierarchical taxonomy: + +```json +[ + { + "id": "msg_001", + "primary_category": "Primary level category", + "secondary_category": "Secondary level category within primary", + "tertiary_category": "Tertiary level category within secondary" + }, + { + "id": "msg_002", + "primary_category": "Primary level category", + "secondary_category": "Secondary level category within primary", + "tertiary_category": "Tertiary level category within secondary" + }, + { + "id": "msg_003", + "primary_category": "Primary level category", + "secondary_category": "Secondary level category within primary", + "tertiary_category": "Tertiary level category within secondary" + } +] + +""" + +REPORT_GEN_LLAMA = """<|begin_of_text|> +<|start_header_id|>system<|end_header_id|> + +You are an expert data analyst. Your task is to generate a structured and insightful JSON report analyzing a batch of messages based on their category levels, summaries, and sentiment scores. Each message is classified into three hierarchical category levels (Category Level 1, Category Level 2, and Category Level 3). + +Follow these steps to generate the report: +1. **Group messages by Category Level 1**, then further organize them into Category Level 2 and Category Level 3 subcategories. +2. **Summarize each category level** by aggregating key themes and trends from message summaries. +3. **Analyze sentiment scores** by calculating the average, highest, and lowest sentiment scores for each category level. +4. **Identify key insights** such as common concerns, prevailing sentiment trends, and outliers. +5. 
**Format the output as a structured JSON object** with the following structure: + +```json +{ + "categories": [ + { + "category_level_1": "Category Name", + "summary": "Concise summary of messages in this category", + "average_sentiment_score": 7.2, + "highest_sentiment_message": { + "summary": "Message summary", + "sentiment_score": 10 + }, + "lowest_sentiment_message": { + "summary": "Message summary", + "sentiment_score": 2 + }, + "key_insights": [ + "Notable pattern 1", + "Notable pattern 2" + ], + "subcategories": [ + { + "category_level_2": "Subcategory Name", + "summary": "Common themes from messages", + "average_sentiment_score": 6.5, + "sentiment_range": { + "lowest": 3, + "highest": 9 + }, + "notable_sentiment_trends": [ + "Trend 1", + "Trend 2" + ], + "subcategories": [ + { + "category_level_3": "Sub-subcategory Name", + "summary": "Detailed message patterns", + "average_sentiment_score": 5.8, + "sentiment_range": { + "lowest": 1, + "highest": 8 + }, + "notable_sentiment_trends": [ + "Specific insight based on sentiment analysis" + ] + } + ] + } + ] + } + ] +} +""" diff --git a/ai/generative-ai-service/sentiment+categorization/files/pages/SentimentByCat.py b/ai/generative-ai-service/sentiment+categorization/files/pages/SentimentByCat.py new file mode 100644 index 000000000..48e8fa00c --- /dev/null +++ b/ai/generative-ai-service/sentiment+categorization/files/pages/SentimentByCat.py @@ -0,0 +1,212 @@ +import json + +import pandas as pd +import streamlit as st +import plotly.express as px + +from backend.feedback_wrapper import FeedbackAgentWrapper + + +# Function to generate Graphviz flowchart with rounded boxes +def create_flowchart(nodes, edges, highlight_node=None, highlight_edge=None, label=""): + dot = "digraph G {\n" + fontname = "Courier New" + dot += f' graph [fontname="{fontname}"];\n' + + # Define nodes with rounded style + for step in nodes: + node_color = "paleturquoise" if step != highlight_node else "gold" + dot += f' "{step}" [shape=box, style="rounded,filled", fillcolor={node_color}, color=dimgrey, fontname="{fontname}", fontsize=12, fontcolor=dimgrey];\n' + + # Define edges with conditional highlighting + for source, target in edges: + edge_color = "dimgrey" if (source, target) != highlight_edge else "gold" + dot += f' "{source}" -> "{target}" [label="{label}", color={edge_color}, penwidth=1.5, arrowsize=0.7];\n' + + dot += "}\n" + return dot + + +def find_result(data): + target_key = "reports" + for value_list in data.values(): # All lists + for subdict in value_list: # All dictionaries inside lists + if isinstance(subdict, dict): # Ensure it is a dict + for subvalue in subdict.values(): # Look for internal values + if isinstance(subvalue, dict) and target_key in subvalue: + return subvalue[target_key] + return None + + +# Function to execute the flow dynamically +def execute_flow(col1, col2): + competitor_report = FeedbackAgentWrapper() + steps, edges = competitor_report.get_nodes_edges() + + i = 0 + current_step = steps[0] + + step_outputs = {step: [{}] for step in steps} + + # Store active step for highlight + if "active_step" not in st.session_state: + st.session_state.active_step = steps[0] + + with col1: + flowchart_placeholder = st.empty() + flowchart_dot = create_flowchart( + steps, edges, highlight_node=current_step, highlight_edge=edges[0] + ) + flowchart_placeholder.graphviz_chart(flowchart_dot) + + with col2: + tab_objects = st.tabs(steps[1:-1]) + + while current_step != steps[-1]: + with col1: + with st.spinner("Wait for it...", show_time=True): + 
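+                # run_step_by_step() advances the LangGraph pipeline by one node and
+                # returns (node_name, {node_name: node_output}); once the stream is
+                # exhausted it returns ("FINALIZED", {}), i.e. an empty output.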
                next_step, output = competitor_report.run_step_by_step()
+                update_tab = True
+                if not output:
+                    next_step = steps[-1]
+                    update_tab = False
+                else:
+                    # Store the output in the dictionary
+                    step_outputs[current_step].append(output)
+
+        if update_tab:
+            idx = steps.index(next_step) - 1
+            with col2:
+                with tab_objects[idx]:
+                    my_tab = st.container(height=550, border=False)
+                    with my_tab:
+                        st.json(
+                            output, expanded=4
+                        )  # Display result inside respective tab
+
+        # Determine the next step & edge
+        current_edge = (current_step, next_step) if next_step else None
+
+        # Update flowchart highlighting current step
+        with col1:
+            flowchart_dot = create_flowchart(
+                steps,
+                edges,
+                highlight_node=next_step,
+                highlight_edge=current_edge,
+            )
+            flowchart_placeholder.graphviz_chart(flowchart_dot)
+
+        current_step = next_step
+        i += 1
+
+    st.session_state.flow_completed = True  # Mark execution as completed
+
+    return step_outputs
+
+
+def display_category(data):
+    # Iterate over categories
+    for category in data["categories"]:
+        with st.container():
+            st.subheader(f"📌 {category['category_level_1']}")
+            st.write(category["summary"])
+
+            # Sentiment Scores
+            col1, col2, col3 = st.columns(3)
+            with col1:
+                st.metric(
+                    "📊 Avg Sentiment Score",
+                    category["average_sentiment_score"],
+                    delta=None,
+                )
+                st.progress(
+                    category["average_sentiment_score"] / 10
+                )  # Normalize the 1-10 score to the 0.0-1.0 range expected by st.progress
+            with col2:
+                st.success(
+                    f"💚 Highest Sentiment: {category['highest_sentiment_message']['sentiment_score']}"
+                )
+                st.write(f"**{category['highest_sentiment_message']['summary']}**")
+            with col3:
+                st.error(
+                    f"❌ Lowest Sentiment: {category['lowest_sentiment_message']['sentiment_score']}"
+                )
+                st.write(f"**{category['lowest_sentiment_message']['summary']}**")
+
+            # Key Insights
+            st.markdown("### 🔍 Key Insights")
+            for insight in category["key_insights"]:
+                st.info(f"📌 {insight}")
+
+            # Subcategories
+            st.markdown("### 📂 Subcategories Breakdown")
+            for subcategory in category["subcategories"]:
+                with st.expander(
+                    f"📎 {subcategory['category_level_2']} (Avg Sentiment: {subcategory['average_sentiment_score']})"
+                ):
+                    st.write(subcategory["summary"])
+
+                    # Sentiment Range Bar
+                    sentiment_df = pd.DataFrame(
+                        {
+                            "Sentiment Score": ["Lowest", "Highest"],
+                            "Value": [
+                                subcategory["sentiment_range"]["lowest"],
+                                subcategory["sentiment_range"]["highest"],
+                            ],
+                        }
+                    )
+                    st.bar_chart(sentiment_df.set_index("Sentiment Score"))
+
+                    # Notable Trends
+                    st.markdown("**📉 Notable Sentiment Trends:**")
+                    for trend in subcategory["notable_sentiment_trends"]:
+                        st.warning(f"🔻 {trend}")
+
+
+def display_sentiment(steps_data):
+    categorize_data = steps_data["summarize"]
+
+    df = pd.DataFrame(categorize_data[1]["categorize"]["categories"])
+
+    st.subheader("📊 Sentiment Analysis by Category")
+    fig = px.bar(
+        df,
+        x="id",
+        y="sentiment_score",
+        color="sentiment_score",
+        text="topic",
+        labels={"id": "Message ID", "sentiment_score": "Sentiment Score (1-10)"},
+        title="Sentiment Scores per Message",
+    )
+
+    fig.update_traces(textposition="inside")
+    st.plotly_chart(fig, use_container_width=True)
+
+
+# Main layout
+st.title("📊 Customer Feedback Dashboard")
+
+# Sidebar: Start button (disabled during execution)
+if "flow_completed" not in st.session_state:
+    st.session_state.flow_completed = True  # Start in ready state
+
+start_button = st.sidebar.button(
+    "Start!", type="primary", disabled=not st.session_state.flow_completed
+)
+
+# Wait for user input
before execution starts +if start_button and st.session_state.flow_completed: + col1, col2 = st.columns([0.4, 0.6], gap="medium") + + st.session_state.flow_completed = False # Lock input + step_outputs = execute_flow(col1, col2) + feedback_result = find_result(step_outputs) + + st.divider() + categories = json.loads(feedback_result[0]) + display_category(categories) + st.divider() + display_sentiment(step_outputs)
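+
+# A minimal headless sketch (not used by this Streamlit page) showing how the same
+# pipeline could be driven from a plain Python script, assuming OCI credentials and
+# backend/utils/config.py have been filled in:
+#
+#   from backend.feedback_wrapper import FeedbackAgentWrapper
+#
+#   agent = FeedbackAgentWrapper()
+#   node, output = agent.run_step_by_step()
+#   while node != "FINALIZED":
+#       print(node, output)
+#       node, output = agent.run_step_by_step()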