Update work on autogen

BenConstable9 · BenConstable9 · commit a3bd1d00cb1d · 2024-11-27T17:34:41.000Z
diff --git a/text_2_sql/autogen/README.md b/text_2_sql/autogen/README.md
@@ -2,6 +2,8 @@
 
 The implementation is written for [AutoGen](https://github.com/microsoft/autogen) in Python, although it can easily be adapted for C#.
 
+**Still a work in progress; expect a lot of updates shortly.**
+
 **The provided AutoGen code only implements Iterations 5 (Agentic Approach)**
 
 ## Full Logical Flow for Agentic Vector Based Approach
diff --git a/text_2_sql/autogen/agentic_text_2_sql.ipynb b/text_2_sql/autogen/agentic_text_2_sql.ipynb
@@ -1,5 +1,25 @@
 {
   "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Copyright (c) Microsoft Corporation.\n",
+        "\n",
+        "Licensed under the MIT License."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "# Text2SQL with AutoGen & Azure OpenAI\n",
+        "\n",
+        "This notebook demonstrates how the AutoGen Agents can be integrated with Azure OpenAI to answer questions from the database based on the schemas provided. \n",
+        "\n",
+        "A multi-shot approach is used for SQL generation for more reliable results and reduced token usage. More details can be found in the README.md."
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
@@ -9,7 +29,7 @@
         "import dotenv\n",
         "import logging\n",
         "from autogen_agentchat.task import Console\n",
-        "from agentic_text_2_sql import text_2_sql_generator"
+        "from agentic_text_2_sql import AgenticText2Sql"
       ]
     },
     {
@@ -30,13 +50,29 @@
         "dotenv.load_dotenv()"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Bot setup"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "agentic_text_2_sql = AgenticText2Sql(target_engine=\"TSQL\", engine_specific_rules=\"Use TOP X to limit the number of rows returned instead of LIMIT X. NEVER USE LIMIT X as it produces a syntax error.\").agentic_flow"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
-        "result = text_2_sql_generator.run_stream(task=\"What are the total number of sales within 2008?\")"
+        "result = agentic_text_2_sql.run_stream(task=\"What are the total number of sales within 2008?\")"
       ]
     },
     {
diff --git a/text_2_sql/autogen/agentic_text_2_sql.py b/text_2_sql/autogen/agentic_text_2_sql.py
@@ -7,81 +7,131 @@
 import logging
 from agents.custom_agents.sql_query_cache_agent import SqlQueryCacheAgent
 import json
+import os
 
-SQL_QUERY_GENERATION_AGENT = LLMAgentCreator.create(
-    "sql_query_generation_agent",
-    target_engine="Microsoft SQL Server",
-    engine_specific_rules="Use TOP X to limit the number of rows returned instead of LIMIT X. NEVER USE LIMIT X as it produces a syntax error.",
-)
-SQL_SCHEMA_SELECTION_AGENT = LLMAgentCreator.create(
-    "sql_schema_selection_agent",
-    use_case="Sales data for a company that specializes in selling products online.",
-)
-SQL_QUERY_CORRECTION_AGENT = LLMAgentCreator.create(
-    "sql_query_correction_agent",
-    target_engine="Microsoft SQL Server",
-    engine_specific_rules="Use TOP X to limit the number of rows returned instead of LIMIT X. NEVER USE LIMIT X as it produces a syntax error.",
-)
-SQL_QUERY_CACHE_AGENT = SqlQueryCacheAgent()
-ANSWER_AGENT = LLMAgentCreator.create("answer_agent")
-QUESTION_DECOMPOSITION_AGENT = LLMAgentCreator.create("question_decomposition_agent")
-
-
-def text_2_sql_generator_selector_func(messages):
-    logging.info("Messages: %s", messages)
-    decision = None  # Initialize decision variable
-
-    if len(messages) == 1:
-        decision = "sql_query_cache_agent"
-
-    elif (
-        messages[-1].source == "sql_query_cache_agent"
-        and messages[-1].content is not None
-    ):
-        cache_result = json.loads(messages[-1].content)
-        if cache_result.get("cached_questions_and_schemas") is not None:
+
+class AgenticText2Sql:
+    def __init__(self, target_engine: str, engine_specific_rules: str):
+        self.use_query_cache = False
+        self.pre_run_query_cache = False
+
+        self.target_engine = target_engine
+        self.engine_specific_rules = engine_specific_rules
+
+        self.set_mode()
+
+    def set_mode(self):
+        """Set the mode of the plugin based on the environment variables."""
+        self.use_query_cache = (
+            os.environ.get("Text2Sql__UseQueryCache", "False").lower() == "true"
+        )
+
+        self.pre_run_query_cache = (
+            os.environ.get("Text2Sql__PreRunQueryCache", "False").lower() == "true"
+        )
+
+    @property
+    def agents(self):
+        """Define the agents for the chat."""
+        SQL_QUERY_GENERATION_AGENT = LLMAgentCreator.create(
+            "sql_query_generation_agent",
+            target_engine=self.target_engine,
+            engine_specific_rules=self.engine_specific_rules,
+        )
+        SQL_SCHEMA_SELECTION_AGENT = LLMAgentCreator.create(
+            "sql_schema_selection_agent",
+            use_case="Sales data for a company that specializes in selling products online.",
+        )
+        SQL_QUERY_CORRECTION_AGENT = LLMAgentCreator.create(
+            "sql_query_correction_agent",
+            target_engine=self.target_engine,
+            engine_specific_rules=self.engine_specific_rules,
+        )
+
+        ANSWER_AGENT = LLMAgentCreator.create("answer_agent")
+        QUESTION_DECOMPOSITION_AGENT = LLMAgentCreator.create(
+            "question_decomposition_agent"
+        )
+
+        agents = [
+            SQL_QUERY_GENERATION_AGENT,
+            SQL_SCHEMA_SELECTION_AGENT,
+            SQL_QUERY_CORRECTION_AGENT,
+            ANSWER_AGENT,
+            QUESTION_DECOMPOSITION_AGENT,
+        ]
+
+        if self.use_query_cache:
+            SQL_QUERY_CACHE_AGENT = SqlQueryCacheAgent()
+            agents.append(SQL_QUERY_CACHE_AGENT)
+
+        return agents
+
+    @property
+    def termination_condition(self):
+        """Define the termination condition for the chat."""
+        termination = TextMentionTermination("TERMINATE") | MaxMessageTermination(10)
+        return termination
+
+    @staticmethod
+    def selector(messages):
+        logging.info("Messages: %s", messages)
+        decision = None  # Initialize decision variable
+
+        if len(messages) == 1:
+            decision = "sql_query_cache_agent"
+
+        elif (
+            messages[-1].source == "sql_query_cache_agent"
+            and messages[-1].content is not None
+        ):
+            cache_result = json.loads(messages[-1].content)
+            if cache_result.get("cached_questions_and_schemas") is not None:
+                decision = "sql_query_correction_agent"
+            else:
+                decision = "sql_schema_selection_agent"
+
+        elif messages[-1].source == "sql_query_cache_agent":
+            decision = "question_decomposition_agent"
+
+        elif messages[-1].source == "question_decomposition_agent":
+            decomposition_result = json.loads(messages[-1].content)
+
+            if len(decomposition_result["entities"]) == 1:
+                decision = "sql_schema_selection_agent"
+            else:
+                decision = "parallel_sql_flow_agent"
+
+        elif messages[-1].source == "sql_schema_selection_agent":
+            decision = "sql_query_generation_agent"
+
+        elif (
+            messages[-1].source == "sql_query_correction_agent"
+            and messages[-1].content == "VALIDATED"
+        ):
+            decision = "answer_agent"
+
+        elif messages[-1].source == "sql_query_correction_agent":
             decision = "sql_query_correction_agent"
-        else:
-            decision = "sql_schema_selection_agent"
-
-    elif messages[-1].source == "question_decomposition_agent":
-        decomposition_result = json.loads(messages[-1].content)
-
-        if len(decomposition_result["entities"]) == 1:
-            decision = "sql_schema_selection_agent"
-        else:
-            decision = "parallel_sql_flow_agent"
-
-    elif messages[-1].source == "sql_schema_selection_agent":
-        decision = "sql_query_generation_agent"
-
-    elif (
-        messages[-1].source == "sql_query_correction_agent"
-        and messages[-1].content == "VALIDATED"
-    ):
-        decision = "answer_agent"
-
-    elif messages[-1].source == "sql_query_correction_agent":
-        decision = "sql_query_correction_agent"
-
-    # Log the decision
-    logging.info("Decision: %s", decision)
-
-    return decision
-
-
-termination = TextMentionTermination("TERMINATE") | MaxMessageTermination(10)
-text_2_sql_generator = SelectorGroupChat(
-    [
-        SQL_QUERY_GENERATION_AGENT,
-        SQL_SCHEMA_SELECTION_AGENT,
-        SQL_QUERY_CORRECTION_AGENT,
-        SQL_QUERY_CACHE_AGENT,
-        ANSWER_AGENT,
-        QUESTION_DECOMPOSITION_AGENT,
-    ],
-    allow_repeated_speaker=False,
-    model_client=MINI_MODEL,
-    termination_condition=termination,
-    selector_func=text_2_sql_generator_selector_func,
-)
+
+        # Log the decision
+        logging.info("Decision: %s", decision)
+
+        return decision
+
+    @property
+    def agentic_flow(self):
+        """Build the group chat that drives the agentic flow.
+
+        Returns:
+        -------
+            SelectorGroupChat: A new chat built from this instance's agents, termination condition and selector."""
+        agentic_flow = SelectorGroupChat(
+            self.agents,
+            allow_repeated_speaker=False,
+            model_client=MINI_MODEL,
+            termination_condition=self.termination_condition,
+            selector_func=AgenticText2Sql.selector,
+        )
+
+        return agentic_flow
diff --git a/text_2_sql/autogen/agents/custom_agents/parallel_sql_flow_agent.py b/text_2_sql/autogen/agents/custom_agents/parallel_sql_flow_agent.py
diff --git a/text_2_sql/autogen/utils/sql.py b/text_2_sql/autogen/utils/sql.py
@@ -105,10 +105,13 @@ async def query_validation(
 ) -> Union[bool | list[dict]]:
     """Validate the SQL query."""
     try:
+        logging.info("Validating SQL Query: %s", sql_query)
         sqlglot.transpile(sql_query)
     except sqlglot.errors.ParseError as e:
+        logging.error("SQL Query is invalid: %s", e.errors)
         return e.errors
     else:
+        logging.info("SQL Query is valid.")
         return True
 
 
diff --git a/text_2_sql/data_dictionary/README.md b/text_2_sql/data_dictionary/README.md
@@ -101,6 +101,6 @@ The following Databases have pre-built scripts for them:
 
 - **Databricks:** `databricks_data_dictionary_creator.py`
 - **Snowflake:** `snowflake_data_dictionary_creator.py`
-- **SQL Server:** `tsql_data_dictionary_creator.py`
+- **TSQL:** `tsql_data_dictionary_creator.py`
 
 If there is no pre-built script for your database engine, take one of the above as a starting point and adjust it.