Update query rewrite agent

BenConstable9 · BenConstable9 · commit 1eb61975bb48 · 2024-12-31T16:08:25.000Z
diff --git a/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/parallel_query_solving_agent.py b/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/parallel_query_solving_agent.py
@@ -75,7 +75,7 @@ async def consume_inner_messages_from_agentic_flow(
                 if isinstance(inner_message, TaskResult) is False:
                     try:
                         inner_message = json.loads(inner_message.content)
-                        logging.info(f"Loaded: {inner_message}")
+                        logging.info(f"Inner Loaded: {inner_message}")
 
                         # Search for specific message types and add them to the final output object
                         if (
@@ -91,6 +91,21 @@ async def consume_inner_messages_from_agentic_flow(
                                 }
                             )
 
+                        if ("contains_pre_run_results" in inner_message) and (
+                            inner_message["contains_pre_run_results"] is True
+                        ):
+                            for pre_run_sql_query, pre_run_result in inner_message[
+                                "cached_questions_and_schemas"
+                            ].items():
+                                database_results[identifier].append(
+                                    {
+                                        "sql_query": pre_run_sql_query.replace(
+                                            "\n", " "
+                                        ),
+                                        "sql_rows": pre_run_result["sql_rows"],
+                                    }
+                                )
+
                     except (JSONDecodeError, TypeError) as e:
                         logging.error("Could not load message: %s", inner_message)
                         logging.warning(f"Error processing message: {e}")
@@ -113,13 +128,15 @@ async def consume_inner_messages_from_agentic_flow(
                 self.engine_specific_rules, **self.kwargs
             )
 
+            identifier = ", ".join(query_rewrite)
+
             # Launch tasks for each sub-query
             inner_solving_generators.append(
                 consume_inner_messages_from_agentic_flow(
                     inner_autogen_text_2_sql.process_question(
                         question=query_rewrite, injected_parameters=injected_parameters
                     ),
-                    query_rewrite,
+                    identifier,
                     database_results,
                 )
             )
diff --git a/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/sql_query_cache_agent.py b/text_2_sql/autogen/src/autogen_text_2_sql/custom_agents/sql_query_cache_agent.py
@@ -39,55 +39,46 @@ async def on_messages_stream(
         self, messages: Sequence[ChatMessage], cancellation_token: CancellationToken
     ) -> AsyncGenerator[AgentMessage | Response, None]:
         # Get the decomposed questions from the query_rewrite_agent
-        parameter_input = messages[0].content
-        last_response = messages[-1].content
         try:
-            user_questions = json.loads(last_response)
-            injected_parameters = json.loads(parameter_input)["injected_parameters"]
+            request_details = json.loads(messages[0].content)
+            injected_parameters = request_details["injected_parameters"]
+            user_questions = request_details["question"]
             logging.info(f"Processing questions: {user_questions}")
             logging.info(f"Input Parameters: {injected_parameters}")
+        except json.JSONDecodeError:
+            # Malformed JSON request: fail fast instead of falling back to a single question
+            raise ValueError("Could not load message")
 
-            # Initialize results dictionary
-            cached_results = {
-                "cached_questions_and_schemas": [],
-                "contains_pre_run_results": False,
-            }
-
-            # Process each question sequentially
-            for question in user_questions:
-                # Fetch the queries from the cache based on the question
-                logging.info(f"Fetching queries from cache for question: {question}")
-                cached_query = await self.sql_connector.fetch_queries_from_cache(
-                    question, injected_parameters=injected_parameters
-                )
+        # Initialize results dictionary
+        cached_results = {
+            "cached_questions_and_schemas": [],
+            "contains_pre_run_results": False,
+        }
 
-                # If any question has pre-run results, set the flag
-                if cached_query.get("contains_pre_run_results", False):
-                    cached_results["contains_pre_run_results"] = True
+        # Process each question sequentially
+        for question in user_questions:
+            # Fetch the queries from the cache based on the question
+            logging.info(f"Fetching queries from cache for question: {question}")
+            cached_query = await self.sql_connector.fetch_queries_from_cache(
+                question, injected_parameters=injected_parameters
+            )
 
-                # Add the cached results for this question
-                if cached_query.get("cached_questions_and_schemas"):
-                    cached_results["cached_questions_and_schemas"].extend(
-                        cached_query["cached_questions_and_schemas"]
-                    )
+            # If any question has pre-run results, set the flag
+            if cached_query.get("contains_pre_run_results", False):
+                cached_results["contains_pre_run_results"] = True
 
-            logging.info(f"Final cached results: {cached_results}")
-            yield Response(
-                chat_message=TextMessage(
-                    content=json.dumps(cached_results), source=self.name
-                )
-            )
-        except json.JSONDecodeError:
-            # If not JSON array, process as single question
-            logging.info(f"Processing single question: {last_response}")
-            cached_queries = await self.sql_connector.fetch_queries_from_cache(
-                last_response
-            )
-            yield Response(
-                chat_message=TextMessage(
-                    content=json.dumps(cached_queries), source=self.name
+            # Add the cached results for this question
+            if cached_query.get("cached_questions_and_schemas"):
+                cached_results["cached_questions_and_schemas"].extend(
+                    cached_query["cached_questions_and_schemas"]
                 )
+
+        logging.info(f"Final cached results: {cached_results}")
+        yield Response(
+            chat_message=TextMessage(
+                content=json.dumps(cached_results), source=self.name
             )
+        )
 
     async def on_reset(self, cancellation_token: CancellationToken) -> None:
         pass
diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/sql.py b/text_2_sql/text_2_sql_core/src/text_2_sql_core/connectors/sql.py
@@ -228,7 +228,7 @@ async def fetch_queries_from_cache(
 
                 for sql_query, sql_result in zip(sql_queries, sql_results):
                     query_result_store[sql_query["SqlQuery"]] = {
-                        "result": sql_result,
+                        "sql_rows": sql_result,
                         "schemas": sql_query["Schemas"],
                     }
 
diff --git a/text_2_sql/text_2_sql_core/src/text_2_sql_core/prompts/query_rewrite_agent.yaml b/text_2_sql/text_2_sql_core/src/text_2_sql_core/prompts/query_rewrite_agent.yaml
@@ -44,7 +44,9 @@ system_message: |
         - Determine if breaking down would simplify processing
 
       3. Break Down Complex Queries:
-        - Create independent sub-queries that can be processed separately
+        - Create independent sub-queries that can be processed separately.
+        - Each sub-query should be a simple, focused task.
+        - Group dependent sub-queries together for sequential processing.
         - Ensure each sub-query is simple and focused
         - Include clear combination instructions
         - Preserve all necessary context in each sub-query
@@ -71,8 +73,8 @@ system_message: |
         Return a JSON object with sub-queries and combination instructions:
         {
           "sub_queries": [
-            "<sub_query_1>",
-            "<sub_query_2>",
+            ["<sub_query_1>"],
+            ["<sub_query_2>"],
             ...
           ],
           "combination_logic": "<instructions for combining results>",
@@ -87,9 +89,7 @@ system_message: |
     Output:
     {
       "sub_queries": [
-        "Calculate quarterly sales totals by product category for 2008",
-        "Identify categories with positive growth each quarter",
-        "For these categories, find their top selling products in 2008"
+        ["Calculate quarterly sales totals by product category for 2008", "For these categories, find their top selling products in 2008"]
       ],
       "combination_logic": "First identify growing categories from quarterly analysis, then find their best-selling products",
       "query_type": "complex"
@@ -100,7 +100,7 @@ system_message: |
     Output:
     {
       "sub_queries": [
-        "How many orders did we have in 2008?"
+        ["How many orders did we have in 2008?"]
       ],
       "combination_logic": "Direct count query, no combination needed",
       "query_type": "simple"
@@ -111,13 +111,11 @@ system_message: |
     Output:
     {
       "sub_queries": [
-        "Get total sales by product in European countries",
-        "Get total sales by product in North American countries",
-        "Calculate total market size for each region",
-        "Find top 5 products by sales in each region",
-        "Calculate market share percentages for these products"
+        ["Get total sales by product in European countries"],
+        ["Get total sales by product in North American countries"],
+        ["Calculate total market size for each region", "Find top 5 products by sales in each region"]
       ],
-      "combination_logic": "First identify top products in each region, then calculate and compare their market shares",
+      "combination_logic": "First identify top products in each region, then calculate and compare their market shares. Questions that depend on the result of each sub-query are combined.",
       "query_type": "complex"
     }
   </examples>

Original file line number	Diff line number	Diff line change
`@@ -228,7 +228,7 @@ async def fetch_queries_from_cache(`
`228`	`228`
`229`	`229`	`for sql_query, sql_result in zip(sql_queries, sql_results):`
`230`	`230`	`query_result_store[sql_query["SqlQuery"]] = {`
`231`		`- "result": sql_result,`
	`231`	`+ "sql_rows": sql_result,`
`232`	`232`	`"schemas": sql_query["Schemas"],`
`233`	`233`	`}`
`234`	`234`