Skip to content

Commit 9b8ab01

Browse files
committed
simplify
1 parent a4b8477 commit 9b8ab01

File tree

2 files changed

+56
-136
lines changed

2 files changed

+56
-136
lines changed

wren-ai-service/src/pipelines/generation/intent_classification.py

Lines changed: 24 additions & 120 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,17 @@
1-
import ast
21
import logging
32
import sys
43
from typing import Any, Literal, Optional
54

65
import orjson
76
from hamilton import base
87
from hamilton.async_driver import AsyncDriver
9-
from haystack import Document
108
from haystack.components.builders.prompt_builder import PromptBuilder
119
from langfuse.decorators import observe
1210
from pydantic import BaseModel
1311

1412
from src.core.pipeline import BasicPipeline
1513
from src.core.provider import DocumentStoreProvider, EmbedderProvider, LLMProvider
16-
from src.pipelines.common import build_table_ddl, clean_up_new_lines
14+
from src.pipelines.common import clean_up_new_lines
1715
from src.pipelines.generation.utils.sql import construct_instructions
1816
from src.utils import trace_cost
1917
from src.web.v1.services import Configuration
@@ -25,7 +23,7 @@
2523
intent_classification_system_prompt = """
2624
### Task ###
2725
You are an expert detective specializing in intent classification. Combine the user's current question and previous questions to determine their true intent based on the provided database schema or sql data if provided.
28-
Classify the intent into one of these categories: `MISLEADING_QUERY`, `TEXT_TO_SQL`, `DATA_EXPLORATION`, `GENERAL`, or `USER_GUIDE`. Additionally, provide a concise reasoning (maximum 20 words) for your classification.
26+
Classify the intent into one of these categories: `MISLEADING_QUERY`, `TEXT_TO_SQL`, `DATA_EXPLORATION`, `GENERAL`, `USER_GUIDE`, or `USER_CLARIFICATION`. Additionally, provide a concise reasoning (maximum 20 words) for your classification.
2927
3028
### Instructions ###
3129
- **Follow the user's previous questions:** If there are previous questions, try to understand the user's current question as following the previous questions.
@@ -34,8 +32,9 @@
3432
- **Rephrase Question:** Rewrite follow-up questions into full standalone questions using prior conversation context.
3533
- **Concise Reasoning:** The reasoning must be clear, concise, and limited to 20 words.
3634
- **Language Consistency:** Use the same language as specified in the user's output language for the rephrased question and reasoning.
37-
- **Vague Queries:** If the question is vague or does not related to a table or property from the schema, classify it as `MISLEADING_QUERY`.
38-
- **Incomplete Queries:** If the question is related to the database schema but references unspecified values (e.g., "the following", "these", "those") without providing them, classify as `GENERAL`.
35+
- **Vague Queries:** If the question does not relate to the database schema, classify it as `MISLEADING_QUERY`.
36+
- **User Clarification:** If the question is related to the database schema, but is missing some details needed in order to answer the question, classify it as `USER_CLARIFICATION`.
37+
- **Incomplete Queries:** If the question is related to the database schema but references unspecified values (e.g., "the following", "these", "those") without providing them, classify as `USER_CLARIFICATION`.
3938
- **Time-related Queries:** Don't rephrase time-related information in the user's question.
4039
4140
### Intent Definitions ###
@@ -73,9 +72,9 @@
7372
- "List the top 10 products by revenue."
7473
</TEXT_TO_SQL>
7574
76-
<GENERAL>
77-
**When to Use:**
78-
- The user seeks general information about the database schema or its overall capabilities.
75+
<USER_CLARIFICATION>
76+
**When to Use:**
77+
- The user's question is related to the database schema, but is missing some details needed in order to answer the question.
7978
- The query references **missing information** (e.g., "the following items" without listing them).
8079
- The query contains **placeholder references** that cannot be resolved from context.
8180
- The query is **incomplete for SQL generation** despite mentioning database concepts.
@@ -85,11 +84,18 @@
8584
- Identify missing parameters, unspecified references, or incomplete filter criteria.
8685
8786
**Examples:**
88-
- "What is the dataset about?"
89-
- "Tell me more about the database."
9087
- "How can I analyze customer behavior with this data?"
9188
- "Show me orders for these products" (without specifying which products)
9289
- "Filter by the criteria I mentioned" (without previous context defining criteria)
90+
</USER_CLARIFICATION>
91+
92+
<GENERAL>
93+
**When to Use:**
94+
- The user seeks general information about the database schema or its overall capabilities
95+
96+
**Examples:**
97+
- "What is the dataset about?"
98+
- "Tell me more about the database."
9399
</GENERAL>
94100
95101
<USER_GUIDE>
@@ -126,7 +132,7 @@
126132
{
127133
"rephrased_question": "<rephrased question in full standalone question if there are previous questions, otherwise the original question>",
128134
"reasoning": "<brief chain-of-thought reasoning (max 20 words)>",
129-
"results": "MISLEADING_QUERY" | "TEXT_TO_SQL" | "DATA_EXPLORATION" |"GENERAL" | "USER_GUIDE"
135+
"results": "MISLEADING_QUERY" | "TEXT_TO_SQL" | "DATA_EXPLORATION" | "GENERAL" | "USER_GUIDE" | "USER_CLARIFICATION"
130136
}
131137
"""
132138

@@ -183,114 +189,11 @@
183189

184190

185191
## Start of Pipeline
186-
@observe(capture_input=False, capture_output=False)
187-
async def embedding(query: str, embedder: Any, histories: list[AskHistory]) -> dict:
188-
previous_query_summaries = (
189-
[history.question for history in histories] if histories else []
190-
)
191-
192-
query = "\n".join(previous_query_summaries) + "\n" + query
193-
194-
return await embedder.run(query)
195-
196-
197-
@observe(capture_input=False)
198-
async def table_retrieval(
199-
embedding: dict, project_id: str, table_retriever: Any
200-
) -> dict:
201-
filters = {
202-
"operator": "AND",
203-
"conditions": [
204-
{"field": "type", "operator": "==", "value": "TABLE_DESCRIPTION"},
205-
],
206-
}
207-
208-
if project_id:
209-
filters["conditions"].append(
210-
{"field": "project_id", "operator": "==", "value": project_id}
211-
)
212-
213-
return await table_retriever.run(
214-
query_embedding=embedding.get("embedding"),
215-
filters=filters,
216-
)
217-
218-
219-
@observe(capture_input=False)
220-
async def dbschema_retrieval(
221-
table_retrieval: dict, embedding: dict, project_id: str, dbschema_retriever: Any
222-
) -> list[Document]:
223-
tables = table_retrieval.get("documents", [])
224-
table_names = []
225-
for table in tables:
226-
content = ast.literal_eval(table.content)
227-
table_names.append(content["name"])
228-
229-
logger.info(f"dbschema_retrieval with table_names: {table_names}")
230-
231-
table_name_conditions = [
232-
{"field": "name", "operator": "==", "value": table_name}
233-
for table_name in table_names
234-
]
235-
236-
filters = {
237-
"operator": "AND",
238-
"conditions": [
239-
{"field": "type", "operator": "==", "value": "TABLE_SCHEMA"},
240-
{"operator": "OR", "conditions": table_name_conditions},
241-
],
242-
}
243-
244-
if project_id:
245-
filters["conditions"].append(
246-
{"field": "project_id", "operator": "==", "value": project_id}
247-
)
248-
249-
results = await dbschema_retriever.run(
250-
query_embedding=embedding.get("embedding"), filters=filters
251-
)
252-
return results["documents"]
253-
254-
255-
@observe()
256-
def construct_db_schemas(dbschema_retrieval: list[Document]) -> list[str]:
257-
db_schemas = {}
258-
for document in dbschema_retrieval:
259-
content = ast.literal_eval(document.content)
260-
if content["type"] == "TABLE":
261-
if document.meta["name"] not in db_schemas:
262-
db_schemas[document.meta["name"]] = content
263-
else:
264-
db_schemas[document.meta["name"]] = {
265-
**content,
266-
"columns": db_schemas[document.meta["name"]].get("columns", []),
267-
}
268-
elif content["type"] == "TABLE_COLUMNS":
269-
if document.meta["name"] not in db_schemas:
270-
db_schemas[document.meta["name"]] = {"columns": content["columns"]}
271-
else:
272-
if "columns" not in db_schemas[document.meta["name"]]:
273-
db_schemas[document.meta["name"]]["columns"] = content["columns"]
274-
else:
275-
db_schemas[document.meta["name"]]["columns"] += content["columns"]
276-
277-
# remove incomplete schemas
278-
db_schemas = {k: v for k, v in db_schemas.items() if "type" in v and "columns" in v}
279-
280-
db_schemas_in_ddl = []
281-
for table_schema in list(db_schemas.values()):
282-
if table_schema["type"] == "TABLE":
283-
ddl, _, _ = build_table_ddl(table_schema)
284-
db_schemas_in_ddl.append(ddl)
285-
286-
return db_schemas_in_ddl
287-
288-
289192
@observe(capture_input=False)
290193
def prompt(
291194
query: str,
292195
wren_ai_docs: list[dict],
293-
construct_db_schemas: list[str],
196+
db_schemas: list[str],
294197
histories: list[AskHistory],
295198
prompt_builder: PromptBuilder,
296199
sql_samples: Optional[list[dict]] = None,
@@ -301,7 +204,7 @@ def prompt(
301204
_prompt = prompt_builder.run(
302205
query=query,
303206
language=configuration.language,
304-
db_schemas=construct_db_schemas,
207+
db_schemas=db_schemas,
305208
histories=histories,
306209
sql_samples=sql_samples,
307210
instructions=construct_instructions(
@@ -321,21 +224,19 @@ async def classify_intent(prompt: dict, generator: Any, generator_name: str) ->
321224

322225

323226
@observe(capture_input=False)
324-
def post_process(classify_intent: dict, construct_db_schemas: list[str]) -> dict:
227+
def post_process(classify_intent: dict) -> dict:
325228
try:
326229
results = orjson.loads(classify_intent.get("replies")[0])
327230
return {
328231
"rephrased_question": results["rephrased_question"],
329232
"intent": results["results"],
330233
"reasoning": results["reasoning"],
331-
"db_schemas": construct_db_schemas,
332234
}
333235
except Exception:
334236
return {
335237
"rephrased_question": "",
336238
"intent": "TEXT_TO_SQL",
337239
"reasoning": "",
338-
"db_schemas": construct_db_schemas,
339240
}
340241

341242

@@ -350,6 +251,7 @@ class IntentClassificationResult(BaseModel):
350251
"GENERAL",
351252
"DATA_EXPLORATION",
352253
"USER_GUIDE",
254+
"USER_CLARIFICATION",
353255
]
354256
reasoning: str
355257

@@ -408,6 +310,7 @@ def __init__(
408310
async def run(
409311
self,
410312
query: str,
313+
db_schemas: list[str],
411314
project_id: Optional[str] = None,
412315
histories: Optional[list[AskHistory]] = None,
413316
sql_samples: Optional[list[dict]] = None,
@@ -420,6 +323,7 @@ async def run(
420323
["post_process"],
421324
inputs={
422325
"query": query,
326+
"db_schemas": db_schemas,
423327
"project_id": project_id or "",
424328
"histories": histories or [],
425329
"sql_samples": sql_samples or [],

wren-ai-service/src/web/v1/services/ask.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -83,14 +83,26 @@ class _AskResultResponse(BaseModel):
8383
trace_id: Optional[str] = None
8484
is_followup: bool = False
8585
general_type: Optional[
86-
Literal["MISLEADING_QUERY", "DATA_ASSISTANCE", "USER_GUIDE", "DATA_EXPLORATION"]
86+
Literal[
87+
"MISLEADING_QUERY",
88+
"DATA_ASSISTANCE",
89+
"USER_GUIDE",
90+
"DATA_EXPLORATION",
91+
"USER_CLARIFICATION",
92+
]
8793
] = None
8894

8995

9096
class AskResultResponse(_AskResultResponse):
9197
is_followup: Optional[bool] = Field(False, exclude=True)
9298
general_type: Optional[
93-
Literal["MISLEADING_QUERY", "DATA_ASSISTANCE", "USER_GUIDE", "DATA_EXPLORATION"]
99+
Literal[
100+
"MISLEADING_QUERY",
101+
"DATA_ASSISTANCE",
102+
"USER_GUIDE",
103+
"DATA_EXPLORATION",
104+
"USER_CLARIFICATION",
105+
]
94106
] = Field(None, exclude=True)
95107

96108

@@ -206,7 +218,11 @@ async def ask(
206218
sql_generation_reasoning = ""
207219
else:
208220
# Run both pipeline operations concurrently
209-
sql_samples_task, instructions_task = await asyncio.gather(
221+
(
222+
sql_samples_task,
223+
instructions_task,
224+
db_schema_retrieval_task,
225+
) = await asyncio.gather(
210226
self._pipelines["sql_pairs_retrieval"].run(
211227
query=user_query,
212228
project_id=ask_request.project_id,
@@ -216,6 +232,12 @@ async def ask(
216232
project_id=ask_request.project_id,
217233
scope="sql",
218234
),
235+
self._pipelines["db_schema_retrieval"].run(
236+
query=user_query,
237+
histories=histories,
238+
project_id=ask_request.project_id,
239+
enable_column_pruning=enable_column_pruning,
240+
),
219241
)
220242

221243
# Extract results from completed tasks
@@ -225,6 +247,12 @@ async def ask(
225247
instructions = instructions_task["formatted_output"].get(
226248
"documents", []
227249
)
250+
_retrieval_result = db_schema_retrieval_task.get(
251+
"construct_retrieval_results", {}
252+
)
253+
documents = _retrieval_result.get("retrieval_results", [])
254+
table_names = [document.get("table_name") for document in documents]
255+
table_ddls = [document.get("table_ddl") for document in documents]
228256

229257
if self._allow_intent_classification:
230258
last_sql_data = None
@@ -240,6 +268,7 @@ async def ask(
240268
intent_classification_result = (
241269
await self._pipelines["intent_classification"].run(
242270
query=user_query,
271+
db_schemas=table_ddls,
243272
histories=histories,
244273
sql_samples=sql_samples,
245274
instructions=instructions,
@@ -368,19 +397,6 @@ async def ask(
368397
is_followup=True if histories else False,
369398
)
370399

371-
retrieval_result = await self._pipelines["db_schema_retrieval"].run(
372-
query=user_query,
373-
histories=histories,
374-
project_id=ask_request.project_id,
375-
enable_column_pruning=enable_column_pruning,
376-
)
377-
_retrieval_result = retrieval_result.get(
378-
"construct_retrieval_results", {}
379-
)
380-
documents = _retrieval_result.get("retrieval_results", [])
381-
table_names = [document.get("table_name") for document in documents]
382-
table_ddls = [document.get("table_ddl") for document in documents]
383-
384400
if not documents:
385401
logger.exception(f"ask pipeline - NO_RELEVANT_DATA: {user_query}")
386402
if not self._is_stopped(query_id, self._ask_results):

0 commit comments

Comments
 (0)