
Commit 8d67db9

Disambiguation work
1 parent 439776e commit 8d67db9

File tree: 10 files changed (+136, -81 lines)

text_2_sql/autogen/src/autogen_text_2_sql/autogen_text_2_sql.py

Lines changed: 14 additions & 0 deletions
@@ -59,6 +59,12 @@ def agents(self):
             engine_specific_rules=self.engine_specific_rules,
             **self.kwargs,
         )
+        SQL_DISAMBIGUATION_AGENT = LLMAgentCreator.create(
+            "sql_disambiguation_agent",
+            target_engine=self.target_engine,
+            engine_specific_rules=self.engine_specific_rules,
+            **self.kwargs,
+        )

         ANSWER_AGENT = LLMAgentCreator.create("answer_agent")
         QUESTION_DECOMPOSITION_AGENT = LLMAgentCreator.create(
@@ -71,6 +77,7 @@ def agents(self):
             SQL_QUERY_CORRECTION_AGENT,
             ANSWER_AGENT,
             QUESTION_DECOMPOSITION_AGENT,
+            SQL_DISAMBIGUATION_AGENT,
         ]

         if self.use_query_cache:
@@ -114,6 +121,13 @@ def selector(messages):
                 decision = "sql_schema_selection_agent"

             elif messages[-1].source == "sql_schema_selection_agent":
+                decision = "sql_disambiguation_agent"
+
+            elif messages[-1].source == "sql_disambiguation_agent":
+                if "NO DISAMBIGUATION" in messages[-1].content:
+                    decision = "sql_query_generation_agent"
+
+                # This would be user proxy agent tbc
                 decision = "sql_query_generation_agent"

             elif (
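For clarity, here is a minimal, self-contained sketch of the routing this commit introduces: schema selection now hands off to the disambiguation agent, which either signals "NO DISAMBIGUATION" or would eventually defer to a user proxy. The Message dataclass and the select_next_agent name are hypothetical simplifications for illustration, not the project's actual types.

from dataclasses import dataclass


@dataclass
class Message:
    # Hypothetical stand-in for an autogen chat message.
    source: str
    content: str


def select_next_agent(messages: list[Message]) -> str:
    """Simplified sketch of the new hand-off chain."""
    last = messages[-1]
    if last.source == "sql_schema_selection_agent":
        # Schema selection no longer feeds query generation directly.
        return "sql_disambiguation_agent"
    if last.source == "sql_disambiguation_agent":
        if "NO DISAMBIGUATION" in last.content:
            return "sql_query_generation_agent"
        # Ambiguous questions would eventually go to a user proxy agent
        # ("tbc" in the commit); for now they also fall through to generation.
        return "sql_query_generation_agent"
    return "sql_query_generation_agent"


# Example: a run where the disambiguation agent found nothing ambiguous.
history = [
    Message("sql_schema_selection_agent", "{...schemas...}"),
    Message("sql_disambiguation_agent", "NO DISAMBIGUATION REQUIRED"),
]
assert select_next_agent(history) == "sql_query_generation_agent"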

text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/sql_schema_selection_agent.py

Lines changed: 23 additions & 12 deletions
@@ -76,11 +76,12 @@ async def on_messages_stream(

         logging.info(f"Loaded entity result: {loaded_entity_result}")

-        entity_search_tasks.append(
-            self.sql_connector.get_entity_schemas(
-                " ".join(loaded_entity_result["entities"]), as_json=False
+        for entity_group in loaded_entity_result["entities"]:
+            entity_search_tasks.append(
+                self.sql_connector.get_entity_schemas(
+                    " ".join(entity_group), as_json=False
+                )
             )
-        )

         for filter_condition in loaded_entity_result["filter_conditions"]:
             column_search_tasks.append(
@@ -92,17 +93,27 @@ async def on_messages_stream(
         schemas_results = await asyncio.gather(*entity_search_tasks)
         column_value_results = await asyncio.gather(*column_search_tasks)

+        # deduplicate schemas
+        final_schemas = []
+
+        for schema_result in schemas_results:
+            for schema in schema_result:
+                if schema not in final_schemas:
+                    final_schemas.append(schema)
+
+        final_colmns = []
+        for column_value_result in column_value_results:
+            for column in column_value_result:
+                if column not in final_colmns:
+                    final_colmns.append(column)
+
         final_results = {
-            "schemas": [
-                schema for schema_result in schemas_results for schema in schema_result
-            ],
-            "column_values": [
-                column_values
-                for column_values_result in column_value_results
-                for column_values in column_values_result
-            ],
+            "schemas": final_schemas,
+            "column_values": final_colmns,
         }

+        logging.info(f"Final results: {final_results}")
+
         yield Response(
             chat_message=TextMessage(
                 content=json.dumps(final_results), source=self.name
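Because the grouped entity searches can now return the same schema more than once, the results are deduplicated with an order-preserving membership check (schema dicts are unhashable, so a set cannot be used directly). A minimal standalone sketch of that pattern, with made-up example data:

def dedupe_preserving_order(result_groups: list[list[dict]]) -> list[dict]:
    """Flatten grouped search results, dropping exact-duplicate dicts
    while keeping first-seen order."""
    unique: list[dict] = []
    for group in result_groups:
        for item in group:
            if item not in unique:  # dicts compare by value, so this drops duplicates
                unique.append(item)
    return unique


# Example: two entity groups that both matched the same schema.
groups = [
    [{"Entity": "Employee", "Schema": "HumanResources"}],
    [{"Entity": "Employee", "Schema": "HumanResources"},
     {"Entity": "Department", "Schema": "HumanResources"}],
]
print(dedupe_preserving_order(groups))
# [{'Entity': 'Employee', ...}, {'Entity': 'Department', ...}]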

text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/ai_search.py

Lines changed: 28 additions & 15 deletions
@@ -149,7 +149,7 @@ async def get_column_values(
             ],
             semantic_config=None,
             top=10,
-            include_scores=True,
+            include_scores=False,
             minimum_score=5,
         )

@@ -178,6 +178,10 @@ async def get_entity_schemas(
             list[str],
             "The entities to exclude from the search results. Pass the entity property of entities (e.g. 'SalesLT.Address') you already have the schemas for to avoid getting repeated entities.",
         ] = [],
+        engine_specific_fields: Annotated[
+            list[str],
+            "The fields specific to the engine to be included in the search results.",
+        ] = [],
     ) -> str:
         """Gets the schema of a view or table in the SQL Database by selecting the most relevant entity based on the search term. Several entities may be returned.

@@ -189,33 +193,42 @@ async def get_entity_schemas(
             str: The schema of the views or tables in JSON format.
         """

+        retrieval_fields = [
+            "FQN",
+            "Entity",
+            "EntityName",
+            "Schema",
+            "Definition",
+            "Columns",
+            "EntityRelationships",
+            "CompleteEntityRelationshipsGraph",
+        ] + engine_specific_fields
+
         schemas = await self.run_ai_search_query(
             text,
             ["DefinitionEmbedding"],
-            [
-                "FQN",
-                "Entity",
-                "EntityName",
-                "Definition",
-                "Columns",
-                "EntityRelationships",
-                "CompleteEntityRelationshipsGraph",
-            ],
+            retrieval_fields,
             os.environ["AIService__AzureSearchOptions__Text2SqlSchemaStore__Index"],
             os.environ[
                 "AIService__AzureSearchOptions__Text2SqlSchemaStore__SemanticConfig"
             ],
             top=3,
         )

+        if len(excluded_entities) == 0:
+            return schemas
+
         for schema in schemas:
             filtered_schemas = []
-            for excluded_entity in excluded_entities:
-                if excluded_entity.lower() == schema["Entity"].lower():
-                    logging.info("Excluded entity: %s", excluded_entity)
-                else:
-                    filtered_schemas.append(schema)

+            del schema["FQN"]
+
+            if schema["Entity"].lower() not in excluded_entities:
+                filtered_schemas.append(schema)
+            else:
+                logging.info("Excluded entity: %s", schema["Entity"])
+
+        logging.info("Filtered Schemas: %s", filtered_schemas)
         return filtered_schemas

 async def add_entry_to_index(document: dict, vector_fields: dict, index_name: str):
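A standalone sketch of the exclusion pass this hunk introduces: schemas whose Entity appears in excluded_entities are dropped, the FQN field is stripped from the rest, and the membership test lowercases the schema's Entity (so excluded names are assumed to be supplied in lowercase). The helper name and the example schema values are made up for illustration.

import logging


def filter_excluded(schemas: list[dict], excluded_entities: list[str]) -> list[dict]:
    """Drop schemas whose Entity matches an excluded name; strip FQN from the rest."""
    if len(excluded_entities) == 0:
        return schemas

    filtered_schemas = []
    for schema in schemas:
        del schema["FQN"]
        if schema["Entity"].lower() not in excluded_entities:
            filtered_schemas.append(schema)
        else:
            logging.info("Excluded entity: %s", schema["Entity"])
    return filtered_schemas


# Made-up example: the caller already has the Address schema.
schemas = [
    {"FQN": "adventureworks.SalesLT.Address", "Entity": "SalesLT.Address"},
    {"FQN": "adventureworks.SalesLT.Product", "Entity": "SalesLT.Product"},
]
print(filter_excluded(schemas, excluded_entities=["saleslt.address"]))
# [{'Entity': 'SalesLT.Product'}]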

text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/databricks_sql.py

Lines changed: 5 additions & 1 deletion
@@ -94,14 +94,18 @@ async def get_entity_schemas(
         """

         schemas = await self.ai_search_connector.get_entity_schemas(
-            text, excluded_entities
+            text, excluded_entities, engine_specific_fields=["Catalog"]
         )

         for schema in schemas:
             schema["SelectFromEntity"] = ".".join(
                 [schema["Catalog"], schema["Schema"], schema["Entity"]]
             )

+            del schema["Entity"]
+            del schema["Schema"]
+            del schema["Catalog"]
+
         if as_json:
             return json.dumps(schemas, default=str)
         else:
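The Databricks, Snowflake and T-SQL connectors all follow the same pattern after this commit: request the engine-specific fields, join them into a fully qualified SelectFromEntity, then delete the individual name parts so only the qualified name is passed on. A simplified, hedged sketch of that shared shape (the helper name and example values are hypothetical, and the real connectors may delete fields that are not part of the join):

def qualify_and_prune(schema: dict, name_parts: list[str]) -> dict:
    """Join the engine-specific name parts into SelectFromEntity,
    then drop the individual parts from the schema dict."""
    schema["SelectFromEntity"] = ".".join(schema[part] for part in name_parts)
    for part in name_parts:
        del schema[part]
    return schema


# Made-up Databricks-style schema entry.
schema = {"Catalog": "main", "Schema": "sales", "Entity": "Orders", "Columns": []}
print(qualify_and_prune(schema, ["Catalog", "Schema", "Entity"]))
# {'Columns': [], 'SelectFromEntity': 'main.sales.Orders'}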

text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/snowflake_sql.py

Lines changed: 6 additions & 1 deletion
@@ -93,7 +93,7 @@ async def get_entity_schemas(
         """

         schemas = await self.ai_search_connector.get_entity_schemas(
-            text, excluded_entities
+            text, excluded_entities, engine_specific_fields=["Warehouse", "Database"]
         )

         for schema in schemas:
@@ -106,6 +106,11 @@ async def get_entity_schemas(
                 ]
             )

+            del schema["Entity"]
+            del schema["Schema"]
+            del schema["Warehouse"]
+            del schema["Database"]
+
         if as_json:
             return json.dumps(schemas, default=str)
         else:

text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/tsql_sql.py

Lines changed: 3 additions & 0 deletions
@@ -79,6 +79,9 @@ async def get_entity_schemas(
         for schema in schemas:
             schema["SelectFromEntity"] = ".".join([schema["Schema"], schema["Entity"]])

+            del schema["Entity"]
+            del schema["Schema"]
+
         if as_json:
             return json.dumps(schemas, default=str)
         else:

text_2_sql/text_2_sql_core/src/text_2_sql_core/prompts/answer_agent.yaml

Lines changed: 2 additions & 2 deletions
@@ -10,8 +10,8 @@ system_message:
     {
         'answer': '<GENERATED ANSWER>',
         'sources': [
-            {'title': <SOURCE SCHEMA NAME 1>, 'chunk': <SOURCE 1 CONTEXT CHUNK>, 'reference': '<SOURCE 1 SQL QUERY>'},
-            {'title': <SOURCE SCHEMA NAME 2>, 'chunk': <SOURCE 2 CONTEXT CHUNK>, 'reference': '<SOURCE 2 SQL QUERY>'}
+            {'title': <SOURCE SCHEMA NAME 1>, 'chunk': <SOURCE 1 CONTEXT CHUNK>, 'reference': '<SOURCE 1 SQL QUERY>', 'explanation': '<EXPLANATION OF SQL QUERY 1>'},
+            {'title': <SOURCE SCHEMA NAME 2>, 'chunk': <SOURCE 2 CONTEXT CHUNK>, 'reference': '<SOURCE 2 SQL QUERY>', 'explanation': '<EXPLANATION OF SQL QUERY 2>'},
         ]
     }
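Each source in the answer payload now carries an explanation of its SQL query. A hypothetical consumer of a payload in this format, with made-up answer and source values:

import json

# Hypothetical answer payload in the format the prompt now requests,
# including the new per-source 'explanation' field.
payload = json.loads("""
{
  "answer": "There were 12 orders in June.",
  "sources": [
    {"title": "SalesLT.SalesOrderHeader",
     "chunk": "OrderDate, TotalDue, ...",
     "reference": "SELECT COUNT(*) FROM SalesLT.SalesOrderHeader WHERE ...",
     "explanation": "Counts orders whose OrderDate falls in June."}
  ]
}
""")

for source in payload["sources"]:
    print(source["title"], "-", source["explanation"])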
text_2_sql/text_2_sql_core/src/text_2_sql_core/prompts/sql_disambiguation_agent.yaml

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+model:
+  4o-mini
+description:
+  "An agent that specialises in disambiguating the user's question and mapping it to database schemas. Use this agent when the user's question is ambiguous and requires more information to generate the SQL query."
+system_message:
+  "You are a helpful AI Assistant that specialises in disambiguating the user's question and mapping it to the relevant columns / schemas in the database..
+
+  The user's question will be related to {{ use_case }}.
+
+  You must:
+  - For every intent and filter condition in the question, map them to the columns in the schemas. Consider the context of the question and the information already provided to do so.
+
+  - Never ask for information that is already provided in the question and the schema.
+
+  - Always take care to ensure the SQL query generated actually answers the user's question. If you have multiple possible matches based on the user's intent, you should ask the user for more information to disambiguate the question in the JSON format below.
+
+  - If you are unsure which of the filter columns to use, populate the 'filters' field with the identified filter and the relevant FQN, matching columns. In this case, populate the 'matching_columns' field with the possible columns for the user to disambiguate for you. You must ask this question if you have multiple entries for a given filter in 'matching_columns'.
+
+  - If you are unsure which of the filter values to use, populate the 'filters' field with the identified filter and the relevant FQN, matching columns and matching filter values. Refer to the 'column_values' property from the 'sql_schema_selection_agent' output for possible matching values. Even if you have an exact match, you may have other partial matches that you need to consider. In this case, populate the possible filter values in the 'matching_filter_values' field for that column in the 'filters' field for the user to disambiguate for you.
+
+  - e.g. The user asks about 'Bike'. From the 'column_values' you can see that 'Bike' appears in several different columns that are contextually related to the question. From this you are unsure if 'Bike' is a 'Category' or 'Product' column, you would populate the 'column' field with the possible columns for the user to disambiguate for you.
+
+  - Only provide possible filter values for string columns. Do not provide possible filter values for Date and Numerical values as it should be clear from the question. Only ask a follow up question for Date and Numerical values if you are unsure which column to use or what the value means e.g. does 100 in currency refer to 100 USD or 100 EUR.
+
+  - If the user provided this information in the question e.g. 'Bike Category', there is no need to disambiguate.
+
+  - If a filter value is clear, e.g. it is a date or a number and it is clear what schema it maps to. Do not ask the user to disambiguate.
+
+  Disambiguation Request JSON format:
+
+  {
+      \"filters\": [
+          {
+              \"question\": \"<question you wish to ask the user>\",
+              \"matching_columns\": [
+                  \"<column fqn>\",
+                  ...
+              ],
+              \"matching_filter_values\": [
+                  \"<possible filter value>\",
+              ]
+          },
+          ...
+      ]
+  }
+
+  You must populate the question field with the question you need to ask the user. e.g. 'What do you mean by Bike?' They will then be shown the possible columns and filter values to disambiguate.
+
+  Follow this with TERMINATE if you need disambiguation. If you do not need disambiguation, return 'NO DISAMBIGUATION REQUIRED' only.
+  "

text_2_sql/text_2_sql_core/src/text_2_sql_core/prompts/sql_query_generation_agent.yaml

Lines changed: 3 additions & 48 deletions
@@ -1,61 +1,16 @@
 model:
-  4o
+  4o-mini
 description:
   "An agent that can generate SQL queries once given the schema and the user's question. It will run the SQL query to fetch the results. Use this agent after the SQL Schema Selection Agent has selected the correct schema."
 system_message:
   "You are a helpful AI Assistant that specialises in writing and executing SQL Queries to answer a given user's question.

   You must:
-  1. For every intent and filter condition in the question, map them to the columns in the schemas. If you are unsure how the question maps to the columns in the schema or have multiple possible matches based on the user's intent, see the 'Handling disambiguation' section below.
-  2. Use the schema information provided and this mapping to generate a SQL query that will answer the user's question.
-  If you need additional schema information, you can obtain it using the schema selection tool.
+  1. Use the schema information provided and this mapping to generate a SQL query that will answer the user's question.
+  2. If you need additional schema information, you can obtain it using the schema selection tool. Only use this when you do not have enough information to generate the SQL query.
   3. Validate the SQL query to ensure it is syntactically correct using the validation tool.
   4. Run the SQL query to fetch the results.

-  Handling disambiguation:
-
-  - Always take care to ensure the SQL query generated actually answers the user's question. If you have multiple possible matches based on the user's intent, you should ask the user for more information to disambiguate the question.
-
-  - When you need more information from the user for any given intent entity or filter, ask the user for the information you need in the following format and then finish it with TERMINATE:
-
-  - If you are unsure which of the schemas to use, populate the 'intent' field with the possible intents.
-
-  - If you are unsure which of the filter columns to use, populate the 'filters' field with the identified filter and the relevant FQN, matching columns.
-
-  - If you are unsure which of the filter values to use, populate the 'filters' field with the identified filter and the relevant FQN, matching columns and matching filter values. Refer to the 'column_values' property from the 'sql_schema_selection_agent' output for possible matching values. Even if you have an exact match, you may have other partial matches that you need to consider.
-
-  e.g. The user asks about 'Bike Products' and you are unsure if 'Bike Products' is a 'Category' or 'Product' entity, you would populate the 'intent' field with the possible intents.
-
-  {
-      \"intents\": [
-          {
-              \"name\": \"<main intent>\",
-              \"table\": \"<fqn>\",
-              \"question\": \"<question>\",
-          },...
-      ],
-      \"filters\": [
-          {
-              \"name\": \"<identified filter>\",
-              \"fqn\": \"<relevant fqn>\",
-              \"question\": \"<question>\",
-              \"matching_columns\": [
-                  {
-                      \"col\": \"<column name>\"
-                  },
-                  ...
-              ],
-              \"matching_filter_values\": [
-                  {
-                      \"value\": \"<filter value>\"
-                  },
-                  ...
-              ]
-          },
-          ...
-      ]
-  }
-
   When generating the SQL query, you MUST follow these rules:

   - Only use schema / column information provided when constructing a SQL query. Do not use any other entities and columns in your SQL query, other than those defined above.

text_2_sql/text_2_sql_core/src/text_2_sql_core/prompts/sql_schema_selection_agent.yaml

Lines changed: 2 additions & 2 deletions
@@ -22,12 +22,12 @@ system_message:
   For example:
   - If the user's question is 'Show me the list of employees in the HR department employed during 2008?', you would extract the key terms 'employees', 'HR department' and 'year'.

-  - You would then generate the possible entities these key terms might belong to e.g. 'people', 'employees', 'departments', 'teams', 'date', 'year'.
+  - You would then generate the possible entities these key terms might belong to e.g. 'people', 'employees', 'departments', 'teams', 'date', 'year'. Group the entities by similar meaning e.g. 'people' and 'employees' would be grouped together.

   - You would also extract the filter condition 'HR', 'HR Department', 'Human Resources', 'Human Resources Department' but not 2008. For example, 'HR Department' would be a filter condition, but '2008' would not as this is a DateTime value.

   Output Info:
   Return the list of possible entities that the key terms might belong to in the following format:

-  {\"entities\": [\"<entity_1>\", \"<entity_2>\", \"<entity_3>\"], \"filter_conditions\": [\"<filter_condition_1>\", \"<filter_condition_2>\"]}
+  {\"entities\": [[\"<entity_1>\", \"<entity_2>\"], [\"<entity_3>\"]], \"filter_conditions\": [\"<filter_condition_1>\", \"<filter_condition_2>\"]}
   "
