Commit c85985e ("update")
1 parent: 912f677

9 files changed: +229 additions, −5 deletions

deployment/kustomizations/base/cm.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -172,6 +172,8 @@ data:
     llm: litellm_llm.default
   - name: data_assistance
     llm: litellm_llm.default
+  - name: data_exploration_assistance
+    llm: litellm_llm.default
   - name: sql_pairs_indexing
     document_store: qdrant
     embedder: litellm_embedder.default
```

docker/config.example.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -122,6 +122,8 @@ pipes:
     llm: litellm_llm.default
   - name: data_assistance
     llm: litellm_llm.default
+  - name: data_exploration_assistance
+    llm: litellm_llm.default
   - name: sql_pairs_indexing
     document_store: qdrant
     embedder: litellm_embedder.default
```

wren-ai-service/src/globals.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -146,6 +146,13 @@ def create_service_container(
                 **pipe_components["followup_sql_generation"],
             ),
             "sql_functions_retrieval": _sql_functions_retrieval_pipeline,
+            "sql_executor": retrieval.SQLExecutor(
+                **pipe_components["sql_executor"],
+                engine_timeout=settings.engine_timeout,
+            ),
+            "data_exploration_assistance": generation.DataExplorationAssistance(
+                **pipe_components["data_exploration_assistance"],
+            ),
         },
         allow_intent_classification=settings.allow_intent_classification,
         allow_sql_generation_reasoning=settings.allow_sql_generation_reasoning,
```

wren-ai-service/src/pipelines/generation/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,6 +1,7 @@
 from .chart_adjustment import ChartAdjustment
 from .chart_generation import ChartGeneration
 from .data_assistance import DataAssistance
+from .data_exploration_assistance import DataExplorationAssistance
 from .followup_sql_generation import FollowUpSQLGeneration
 from .followup_sql_generation_reasoning import FollowUpSQLGenerationReasoning
 from .intent_classification import IntentClassification
@@ -36,4 +37,5 @@
     "FollowUpSQLGenerationReasoning",
     "MisleadingAssistance",
     "SQLTablesExtraction",
+    "DataExplorationAssistance",
 ]
```

wren-ai-service/src/pipelines/generation/data_exploration_assistance.py (new file)

Lines changed: 141 additions & 0 deletions

```python
import asyncio
import logging
import sys
from typing import Any, Optional

from hamilton import base
from hamilton.async_driver import AsyncDriver
from haystack.components.builders.prompt_builder import PromptBuilder
from langfuse.decorators import observe

from src.core.pipeline import BasicPipeline
from src.core.provider import LLMProvider

logger = logging.getLogger("wren-ai-service")


data_exploration_assistance_system_prompt = """
You are a great data analyst good at exploring data.
You are given a user question and SQL data.
You need to understand the user question and the SQL data, and then answer the user question.
### INSTRUCTIONS ###
1. Your answer should be in the same language as the language the user provided.
2. You must base your answer on the SQL data.
3. You should provide your answer in Markdown format.
4. You have the following skills:
    - explain the data in an easy-to-understand manner
    - provide insights and trends in the data
    - identify anomalies and outliers in the data
5. You only need to use the skills required to answer the user question based on the user question and the SQL data.
### OUTPUT FORMAT ###
Please provide your response in proper Markdown format without ```markdown``` tags.
"""

data_exploration_assistance_user_prompt_template = """
User Question: {{query}}
Language: {{language}}
SQL Data:
{{ sql_data }}
Please think step by step.
"""


## Start of Pipeline
@observe(capture_input=False)
def prompt(
    query: str,
    language: str,
    sql_data: dict,
    prompt_builder: PromptBuilder,
) -> dict:
    return prompt_builder.run(
        query=query,
        language=language,
        sql_data=sql_data,
    )


@observe(as_type="generation", capture_input=False)
async def data_exploration_assistance(
    prompt: dict, generator: Any, query_id: str
) -> dict:
    return await generator(prompt=prompt.get("prompt"), query_id=query_id)


## End of Pipeline


class DataExplorationAssistance(BasicPipeline):
    def __init__(
        self,
        llm_provider: LLMProvider,
        **kwargs,
    ):
        self._user_queues = {}
        self._components = {
            "generator": llm_provider.get_generator(
                system_prompt=data_exploration_assistance_system_prompt,
                streaming_callback=self._streaming_callback,
            ),
            "prompt_builder": PromptBuilder(
                template=data_exploration_assistance_user_prompt_template
            ),
        }

        super().__init__(
            AsyncDriver({}, sys.modules[__name__], result_builder=base.DictResult())
        )

    def _streaming_callback(self, chunk, query_id):
        if query_id not in self._user_queues:
            # Create a new queue for the user if it doesn't exist
            self._user_queues[query_id] = asyncio.Queue()
        # Put the chunk content into the user's queue
        asyncio.create_task(self._user_queues[query_id].put(chunk.content))
        if chunk.meta.get("finish_reason"):
            asyncio.create_task(self._user_queues[query_id].put("<DONE>"))

    async def get_streaming_results(self, query_id):
        async def _get_streaming_results(query_id):
            return await self._user_queues[query_id].get()

        if query_id not in self._user_queues:
            self._user_queues[query_id] = asyncio.Queue()

        while True:
            try:
                # Wait for an item from the user's queue
                self._streaming_results = await asyncio.wait_for(
                    _get_streaming_results(query_id), timeout=120
                )
                # Check for end-of-stream signal
                if self._streaming_results == "<DONE>":
                    del self._user_queues[query_id]
                    break
                if self._streaming_results:  # Check if there are results to yield
                    yield self._streaming_results
                    self._streaming_results = ""  # Clear after yielding
            except TimeoutError:
                break

    @observe(name="Data Exploration Assistance")
    async def run(
        self,
        query: str,
        sql_data: dict,
        language: str,
        query_id: Optional[str] = None,
    ):
        logger.info("Data Exploration Assistance pipeline is running...")
        return await self._pipe.execute(
            ["data_exploration_assistance"],
            inputs={
                "query": query,
                "language": language,
                "query_id": query_id or "",
                "sql_data": sql_data,
                **self._components,
            },
        )
```
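
For reference, here is a minimal usage sketch of the new pipeline, not part of the commit. The `llm_provider` wiring, the `query_id` value, and the shape of `sql_data` are assumptions for illustration; in the service itself the pipeline is built from `pipe_components["data_exploration_assistance"]` (see the `globals.py` change above).

```python
# Minimal sketch (assumptions noted inline): driving DataExplorationAssistance directly.
import asyncio

from src.pipelines.generation import DataExplorationAssistance


async def explore(llm_provider) -> None:
    # `llm_provider` is assumed to be an already-configured LLMProvider instance.
    pipeline = DataExplorationAssistance(llm_provider=llm_provider)
    query_id = "demo-query-id"  # hypothetical id

    # Run the pipeline in the background; streamed chunks land in a per-query queue.
    task = asyncio.create_task(
        pipeline.run(
            query="What's the trend of the data?",
            sql_data={"columns": ["month", "revenue"], "data": [["Jan", 120], ["Feb", 95]]},
            language="English",
            query_id=query_id,
        )
    )

    # Consume chunks until the "<DONE>" sentinel (or the 120s timeout) ends the stream.
    async for chunk in pipeline.get_streaming_results(query_id):
        print(chunk, end="", flush=True)

    await task
```

The same queue-per-query pattern is what `ask.py` relies on below when it resolves `general_type == "DATA_EXPLORATION"` to this pipeline.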

wren-ai-service/src/pipelines/generation/intent_classification.py

Lines changed: 35 additions & 3 deletions
```diff
@@ -24,7 +24,8 @@
 
 intent_classification_system_prompt = """
 ### Task ###
-You are an expert detective specializing in intent classification. Combine the user's current question and previous questions to determine their true intent based on the provided database schema. Classify the intent into one of these categories: `MISLEADING_QUERY`, `TEXT_TO_SQL`, `GENERAL`, or `USER_GUIDE`. Additionally, provide a concise reasoning (maximum 20 words) for your classification.
+You are an expert detective specializing in intent classification. Combine the user's current question and previous questions to determine their true intent based on the provided database schema, or the SQL data if provided.
+Classify the intent into one of these categories: `MISLEADING_QUERY`, `TEXT_TO_SQL`, `DATA_EXPLORATION`, `GENERAL`, or `USER_GUIDE`. Additionally, provide a concise reasoning (maximum 20 words) for your classification.
 
 ### Instructions ###
 - **Follow the user's previous questions:** If there are previous questions, try to understand the user's current question as following the previous questions.
@@ -39,6 +40,19 @@
 
 ### Intent Definitions ###
 
+<DATA_EXPLORATION>
+**When to Use:**
+- The user's question is about data exploration, such as asking for data details, an explanation of the data, insights, recommendations, comparisons, etc.
+**Requirements:**
+- SQL DATA is provided and the user's question is about exploring the data.
+- The user's question can be answered by the SQL DATA.
+- The row size of the SQL DATA is less than 500.
+**Examples:**
+- "Show me the part where the data appears abnormal"
+- "Please explain the data in the table"
+- "What's the trend of the data?"
+</DATA_EXPLORATION>
+
 <TEXT_TO_SQL>
 **When to Use:**
 - The user's inputs are about modifying SQL from previous questions.
@@ -51,6 +65,7 @@
 - Must have complete filter criteria, specific values, or clear references to previous context.
 - Include specific table and column names from the schema in your reasoning or modifying SQL from previous questions.
 - Reference phrases from the user's inputs that clearly relate to the schema.
+- The SQL DATA is not provided or cannot answer the user's question, and the user's question can be answered given the database schema.
 
 **Examples:**
 - "What is the total sales for last quarter?"
@@ -111,7 +126,7 @@
 {
     "rephrased_question": "<rephrased question in full standalone question if there are previous questions, otherwise the original question>",
     "reasoning": "<brief chain-of-thought reasoning (max 20 words)>",
-    "results": "MISLEADING_QUERY" | "TEXT_TO_SQL" | "GENERAL" | "USER_GUIDE"
+    "results": "MISLEADING_QUERY" | "TEXT_TO_SQL" | "DATA_EXPLORATION" | "GENERAL" | "USER_GUIDE"
 }
 """
 
@@ -143,6 +158,12 @@
 - {{doc.path}}: {{doc.content}}
 {% endfor %}
 
+{% if sql_data %}
+### SQL DATA ###
+{{ sql_data }}
+row size of SQL DATA: {{ sql_data_size }}
+{% endif %}
+
 ### INPUT ###
 {% if histories %}
 User's previous questions:
@@ -275,6 +296,7 @@ def prompt(
     sql_samples: Optional[list[dict]] = None,
     instructions: Optional[list[dict]] = None,
     configuration: Configuration | None = None,
+    sql_data: Optional[dict] = None,
 ) -> dict:
     _prompt = prompt_builder.run(
         query=query,
@@ -286,6 +308,8 @@ def prompt(
             instructions=instructions,
         ),
         docs=wren_ai_docs,
+        sql_data=sql_data,
+        sql_data_size=len(sql_data.get("data", [])),
     )
     return {"prompt": clean_up_new_lines(_prompt.get("prompt"))}
 
@@ -320,7 +344,13 @@ def post_process(classify_intent: dict, construct_db_schemas: list[str]) -> dict
 
 class IntentClassificationResult(BaseModel):
     rephrased_question: str
-    results: Literal["MISLEADING_QUERY", "TEXT_TO_SQL", "GENERAL", "USER_GUIDE"]
+    results: Literal[
+        "MISLEADING_QUERY",
+        "TEXT_TO_SQL",
+        "GENERAL",
+        "DATA_EXPLORATION",
+        "USER_GUIDE",
+    ]
     reasoning: str
 
 
@@ -383,6 +413,7 @@ async def run(
         sql_samples: Optional[list[dict]] = None,
         instructions: Optional[list[dict]] = None,
         configuration: Configuration = Configuration(),
+        sql_data: Optional[dict] = None,
     ):
         logger.info("Intent Classification pipeline is running...")
         return await self._pipe.execute(
@@ -394,6 +425,7 @@ async def run(
                 "sql_samples": sql_samples or [],
                 "instructions": instructions or [],
                 "configuration": configuration,
+                "sql_data": sql_data or {},
                 **self._components,
                 **self._configs,
             },
```

wren-ai-service/src/web/v1/services/ask.py

Lines changed: 36 additions & 2 deletions
```diff
@@ -83,14 +83,14 @@ class _AskResultResponse(BaseModel):
     trace_id: Optional[str] = None
     is_followup: bool = False
     general_type: Optional[
-        Literal["MISLEADING_QUERY", "DATA_ASSISTANCE", "USER_GUIDE"]
+        Literal["MISLEADING_QUERY", "DATA_ASSISTANCE", "USER_GUIDE", "DATA_EXPLORATION"]
     ] = None
 
 
 class AskResultResponse(_AskResultResponse):
     is_followup: Optional[bool] = Field(False, exclude=True)
     general_type: Optional[
-        Literal["MISLEADING_QUERY", "DATA_ASSISTANCE", "USER_GUIDE"]
+        Literal["MISLEADING_QUERY", "DATA_ASSISTANCE", "USER_GUIDE", "DATA_EXPLORATION"]
     ] = Field(None, exclude=True)
 
 
@@ -227,6 +227,16 @@ async def ask(
             )
 
         if self._allow_intent_classification:
+            last_sql_data = None
+            if histories:
+                if last_sql := histories[-1].sql:
+                    last_sql_data = (
+                        await self._pipelines["sql_executor"].run(
+                            sql=last_sql,
+                            project_id=ask_request.project_id,
+                        )
+                    )["execute_sql"]["results"]
+
             intent_classification_result = (
                 await self._pipelines["intent_classification"].run(
                     query=user_query,
@@ -235,6 +245,7 @@ async def ask(
                     instructions=instructions,
                     project_id=ask_request.project_id,
                     configuration=ask_request.configurations,
+                    sql_data=last_sql_data,
                 )
             ).get("post_process", {})
             intent = intent_classification_result.get("intent")
@@ -317,6 +328,27 @@ async def ask(
                 )
                 results["metadata"]["type"] = "GENERAL"
                 return results
+            elif intent == "DATA_EXPLORATION":
+                asyncio.create_task(
+                    self._pipelines["data_exploration_assistance"].run(
+                        query=user_query,
+                        sql_data=last_sql_data,
+                        language=ask_request.configurations.language,
+                        query_id=ask_request.query_id,
+                    )
+                )
+
+                self._ask_results[query_id] = AskResultResponse(
+                    status="finished",
+                    type="GENERAL",
+                    rephrased_question=rephrased_question,
+                    intent_reasoning=intent_reasoning,
+                    trace_id=trace_id,
+                    is_followup=True if histories else False,
+                    general_type="DATA_EXPLORATION",
+                )
+                results["metadata"]["type"] = "GENERAL"
+                return results
             else:
                 self._ask_results[query_id] = AskResultResponse(
                     status="understanding",
@@ -639,6 +671,8 @@ async def get_ask_streaming_result(
             _pipeline_name = "data_assistance"
         elif self._ask_results.get(query_id).general_type == "MISLEADING_QUERY":
             _pipeline_name = "misleading_assistance"
+        elif self._ask_results.get(query_id).general_type == "DATA_EXPLORATION":
+            _pipeline_name = "data_exploration_assistance"
        elif self._ask_results.get(query_id).status == "planning":
             if self._ask_results.get(query_id).is_followup:
                 _pipeline_name = "followup_sql_generation_reasoning"
```

wren-ai-service/tools/config/config.example.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -135,6 +135,8 @@ pipes:
     llm: litellm_llm.default
   - name: data_assistance
     llm: litellm_llm.default
+  - name: data_exploration_assistance
+    llm: litellm_llm.default
   - name: sql_pairs_indexing
     document_store: qdrant
     embedder: litellm_embedder.default
```

wren-ai-service/tools/config/config.full.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -135,6 +135,8 @@ pipes:
     llm: litellm_llm.default
   - name: data_assistance
     llm: litellm_llm.default
+  - name: data_exploration_assistance
+    llm: litellm_llm.default
   - name: sql_pairs_indexing
     document_store: qdrant
     embedder: litellm_embedder.default
```
