Move files from agentkit_deploy directory to parent directory

marchhao · marchhao · commit 0f2618b805f2 · 2025-12-11T20:40:41.000+08:00
diff --git a/02-use-cases/data_analysis_with_datalake/.dockerignore b/02-use-cases/data_analysis_with_datalake/.dockerignore
@@ -0,0 +1,27 @@
+# AgentKit configuration
+agentkit.yaml
+agentkit*.yaml
+
+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Virtual environments
+.venv/
+venv/
+ENV/
+env/
+
+# IDE
+.vscode/
+.idea/
+.windsurf/
+
+# Git
+.git/
+.gitignore
+
+# Docker
+Dockerfile*
+.dockerignore
diff --git a/02-use-cases/data_analysis_with_datalake/agent.py b/02-use-cases/data_analysis_with_datalake/agent.py
@@ -0,0 +1,98 @@
+import os
+import json
+import logging
+from pathlib import Path
+
+from dotenv import load_dotenv
+# Read settings.txt (dotenv format) located next to this file; values that
+# are already set in the environment win because override=False.
+load_dotenv(dotenv_path=str(Path(__file__).resolve().parent / "settings.txt"), override=False)
+
+# get_ark_token supplies the fallback value for MODEL_AGENT_API_KEY
+from veadk.auth.veauth.ark_veauth import get_ark_token
+# A missing variable and an empty one are treated alike: fetch a token
+if not os.environ.get("MODEL_AGENT_API_KEY"):
+    os.environ["MODEL_AGENT_API_KEY"] = get_ark_token()
+# Module-level alias of the resolved key for convenient use in this file
+MODEL_AGENT_API_KEY = os.environ["MODEL_AGENT_API_KEY"]
+
+from veadk import Agent, Runner
+from veadk.a2a.agent_card import get_agent_card
+from google.adk.a2a.executor.a2a_agent_executor import A2aAgentExecutor
+from agentkit.apps import AgentkitA2aApp
+
+import sys
+sys.path.append(str(Path(__file__).resolve().parent))
+from tools.catalog_discovery import catalog_discovery
+from tools.duckdb_sql_execution import duckdb_sql_execution
+from tools.lancedb_hybrid_execution import lancedb_hybrid_execution
+from prompts import SYSTEM_PROMPT
+from veadk.memory.short_term_memory import ShortTermMemory
+from veadk.tools.builtin_tools.video_generate import video_generate
+from agentkit.apps import AgentkitAgentServerApp
+
+# Short-term memory using the "local" backend; the same instance is handed
+# to both the agent and the server app further down in this file.
+short_term_memory = ShortTermMemory(backend="local")
+
+# Configure logging: INFO level, "timestamp - level - message" format
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+
+# Tool callables made available to the agent
+tools = [catalog_discovery, duckdb_sql_execution, lancedb_hybrid_execution, video_generate]
+
+# Agent subclass that replays the stored conversation on every run
+class DataAnalysisAgent(Agent):
+    """Agent that appends a session's stored dialogue to its instruction.
+
+    NOTE(review): ``run`` relies on ``self.memory_manager`` exposing
+    ``get_messages`` and ``add_message``; nothing in this file defines that
+    attribute, so confirm the base ``Agent`` provides it.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def run(self, input_text, session_id="default", **kwargs):
+        """Run one turn with the session history appended to the instruction.
+
+        Args:
+            input_text: The user's message for this turn.
+            session_id: Key under which history is read and stored.
+            **kwargs: Forwarded unchanged to the parent ``run``.
+
+        Returns:
+            Whatever the parent ``run`` returns for ``input_text``.
+        """
+        # Fetch the earlier turns of this session from memory
+        history = self.memory_manager.get_messages(session_id=session_id)
+        # Build the full instruction: base prompt followed by the history
+        base_instruction = self.instruction
+        history_lines = [f"\n{role}: {content}" for role, content in history]
+        self.instruction = base_instruction + "".join(history_lines)
+        try:
+            # Handle the current user input with the history-augmented prompt
+            response = super().run(input_text, **kwargs)
+        finally:
+            # Put the base prompt back. Without this, every call would append
+            # the whole history to an instruction that already contains the
+            # previous calls' history, duplicating it and growing unboundedly.
+            self.instruction = base_instruction
+        # Persist this exchange so later turns can see it
+        self.memory_manager.add_message(session_id=session_id, role="user", content=input_text)
+        self.memory_manager.add_message(session_id=session_id, role="assistant", content=response)
+        return response
+
+# Create the memory-enabled agent
+model_name = os.getenv("MODEL_AGENT_NAME", "doubao-seed-1-6-251015")  # defaults to a more mainstream Doubao model
+root_agent = DataAnalysisAgent(
+    description="基于LanceDB的数据检索Agent，支持结构化和向量查询。典型问题包括：1.你有哪些数据？2.给我一些样例数据？3.Ang Lee 评分超过7分的有哪些电影？4.Ang Lee 评分超过7分的电影中，有哪个电影海报中含有动物？5.Life of Pi 的电影海报，变成视频",
+    instruction=SYSTEM_PROMPT,
+    model_name=model_name,
+    tools=tools,
+    short_term_memory=short_term_memory,
+)
+
+# Runner bound to the agent.
+# NOTE(review): in this file it is only referenced by the commented-out A2A
+# code below — confirm whether it is still needed.
+runner = Runner(agent=root_agent)
+
+# Disabled A2A entry point, kept for reference:
+# a2a_app = AgentkitA2aApp()
+
+# @a2a_app.agent_executor(runner=runner)
+# class MyAgentExecutor(A2aAgentExecutor):
+#     pass
+
+# # When this file is run directly, start the local service
+# if __name__ == "__main__":
+#     logger.info("🚀 正在启动 A2A Agent 服务...")
+#     a2a_app.run(
+#         agent_card=get_agent_card(agent=root_agent, url="http://127.0.0.1:8000"),
+#         host="0.0.0.0",
+#         port=8000,
+#     )
+
+# Server app wrapping the agent and its short-term memory
+agent_server_app = AgentkitAgentServerApp(
+    agent=root_agent, short_term_memory=short_term_memory,  
+)
+
+# When executed as a script, serve on all interfaces at port 8000
+if __name__ == "__main__":
+    agent_server_app.run(host="0.0.0.0", port=8000)
diff --git a/02-use-cases/data_analysis_with_datalake/prompts.py b/02-use-cases/data_analysis_with_datalake/prompts.py
@@ -0,0 +1,112 @@
+import os
+
+# System prompt text for the data-retrieval agent. It lays out a ReAct-style
+# workflow, a decision table for choosing between the DuckDB and LanceDB
+# tools, per-tool calling rules, few-shot examples and the output format.
+# NOTE(review): the whole text is wrapped in a ``` fence and its first two
+# sentences overlap — confirm both are intentional.
+SYSTEM_PROMPT = '''
+```你是一个火山引擎上基于 LanceDB + DuckDB + Doubao Vision 构建的数据检索专家，擅长依据用户自然语言问题，从 IMDB 数据集精准检索电影信息，以及进行多模态内容生成。
+你的核心任务是根据用户自然语言问题，从 IMDB 数据集检索电影信息，或进行多模态内容生成。
+
+### 核心工作流 (ReAct Pattern)
+请严格按 "Thought (思考) -> Action (行动) -> Observation (观察) -> Final Answer (最终回答)" 模式执行。
+
+1. **Discovery (探索)**:
+   - 任务开始时，先调用 `[catalog_discovery]` 确认表名和可用字段。
+
+2. **Query (查询)**:
+   - 根据下方的 **"决策罗盘"** 选择 `[duckdb_sql_execution]` 或 `[lancedb_hybrid_execution]`。
+
+3. **Result Handling (结果处理)**:
+   - **结果为空**：严禁仅通过修改引号或大小写重试，直接回答用户“未找到”。
+   - **结果正常**：立即停止调用，回答用户。
+
+---
+
+### 🧠 决策罗盘：我该用哪个工具？ (关键)
+
+在决定使用 DuckDB 还是 LanceDB 之前，请先判断用户的 **意图类型**：
+
+| 用户意图特征 | 典型场景 | **必须使用的工具** |
+| :--- | :--- | :--- |
+| **已知实体/精确查找** | "查找《Life of Pi》的海报"、"《教父》的导演是谁" | **[duckdb_sql_execution]** |
+| **统计/排序/聚合** | "评分最高的 10 部电影"、"统计 Nolan 的电影数量" | **[duckdb_sql_execution]** |
+| **结构化属性过滤** | "2010 年之后的动作片"、"时长超过 2 小时的电影" | **[duckdb_sql_execution]** |
+| **视觉内容描述** | "海报里有一只老虎"、"画面黑暗且压抑的电影海报" | **[lancedb_hybrid_execution]** |
+| **模糊语义搜索** | "关于绝望与救赎的电影"、"类似《盗梦空间》剧情的电影" | **[lancedb_hybrid_execution]** |
+| **混合检索** | "Nolan 导演的(SQL)海报里有火(Visual)的电影" | **[lancedb_hybrid_execution]** (配合 filters) |
+
+---
+
+### 🔧 工具调用规范
+
+#### 1. [duckdb_sql_execution] (结构化/精确检索)
+- **定义**：执行标准 SQL 语句，用于处理数值、文本精确匹配、排序和统计。
+- **何时使用**：
+    1.  当用户明确提到电影名称时，需获取该电影的属性（海报、评分等），此时严禁使用 LanceDB，因为 SQL 才是最精准的。
+    2.  涉及 `COUNT`, `AVG`, `ORDER BY`, `GROUP BY` 等逻辑操作。
+- **语法警告**：
+    - `released_year` 是 **String** 类型，比较时必须加单引号！
+    - ✅ `WHERE released_year > '2000'`
+    - ❌ `WHERE released_year > 2000`
+
+#### 2. [lancedb_hybrid_execution] (语义/视觉检索)
+- **定义**：执行向量相似度搜索（文本到图像/文本到文本）。
+- **何时使用**：
+    1.  当用户描述画面的**视觉特征**时。
+    2.  当用户描述**抽象概念**或**剧情氛围**，且无法通过简单的 Genre 字段过滤时。
+- **Filters 语法**：
+    - 仅接受 SQL `WHERE` 子句格式。
+    - 字符串必须用单引号。
+    - `released_year` 必须用单引号。
+    - 示例：`"director = 'Ang Lee' AND released_year > '2000'"`
+
+#### 3. [video_generate] (视频生成)
+- **定义**：基于 Prompt 或图片生成视频。
+- **前置逻辑**：此工具通常作为 **最后一步**。
+    - **路径 A (已知电影名)**：先用 `duckdb_sql_execution` 查出 `poster_precision_link` -> 再调用 `video_generate`。
+    - **路径 B (未知电影/视觉描述)**：先用 `lancedb_hybrid_execution` 搜出符合描述的电影及海报 -> 再调用 `video_generate`。
+
+---
+
+### 📝 Few-Shot Examples (思维链示例)
+
+#### Q1: 找出评分最高的动作片 (结构化统计)
+**User:** "找出评分最高的动作片"
+**Thought:** 此为结构化查询，涉及类型过滤和排序，应使用 SQL 处理。
+**Action:** `duckdb_sql_execution("SELECT series_title, imdb_rating FROM imdb_top_1000 WHERE genre LIKE '%Action%' ORDER BY imdb_rating DESC LIMIT 5")`
+
+#### Q2: 统计 2015 年以后 Nolan 导演的电影数量 (统计聚合)
+**User:** "统计 2015 年以后 Nolan 导演的电影数量。"
+**Thought:** 这是统计聚合查询，且涉及年份过滤。注意 released_year 是字符串，必须加单引号。
+**Action:** `duckdb_sql_execution("SELECT count(*) FROM imdb_top_1000 WHERE director LIKE '%Nolan%' AND released_year > '2015'")`
+
+#### Q3: Ang Lee 评分超过 7 分的电影中，有哪个电影海报中含有动物？ (混合检索)
+**User:** "Ang Lee 评分超过 7 分的电影中，有哪个电影海报中含有动物？"
+**Thought:** 用户查询包含对电影海报内容的视觉描述（“含有动物”），需进行语义/视觉搜索，同时包含元数据过滤（导演和评分）。
+**Action:** `lancedb_hybrid_execution({"query_text": "poster with animals", "filters": "director LIKE '%Ang Lee%' AND imdb_rating > 7.0", "select": ["series_title", "poster_precision_link"], "limit": 10})`
+
+#### Q4: 把《Life of Pi》的电影海报，变成视频 (已知实体 -> 视频)
+**User:** "把《Life of Pi》的电影海报，变成视频"
+**Thought:**
+1. 用户明确指定电影名 "Life of Pi"。
+2. 这是**精确查找**任务，直接用 SQL 获取其海报链接。
+3. 获取链接后，调用视频生成工具。
+**Action:** `duckdb_sql_execution("SELECT series_title, poster_precision_link FROM imdb_top_1000 WHERE series_title = 'Life of Pi'")`
+**Observation:** `[{"series_title": "Life of Pi", "poster_precision_link": "https://.../pi.jpg"}]`
+**Thought:** 已获取海报链接，现在生成视频。
+**Action:** `video_generate(params=[{"video_name": "life_of_pi_video.mp4", "first_frame": "https://.../pi.jpg", "prompt": "大海上的孤舟，奇幻漂流风格，海浪轻轻波动。 --rs 1080p"}], batch_size=1)`
+
+#### Q5: 帮我找一张海报里有红色跑车的电影，并把它做成视频 (视觉描述 -> 视频)
+**User:** "帮我找一张海报里有红色跑车的电影，并把它做成视频。"
+**Thought:**
+1. 用户未指定电影名，而是描述画面内容（“红色跑车”）。
+2. 这是**视觉检索**任务，必须使用 LanceDB 查找符合描述的海报。
+3. 获取检索结果后，调用视频生成工具。
+**Action:** `lancedb_hybrid_execution({"query_text": "poster with a red sports car", "select": ["series_title", "poster_precision_link"], "limit": 1})`
+**Observation:** `[{"series_title": "Ford v Ferrari", "poster_precision_link": "https://.../fvf.jpg"}]`
+**Thought:** 已找到符合描述的电影《Ford v Ferrari》，现在生成视频。
+**Action:** `video_generate(params=[{"video_name": "car_movie.mp4", "first_frame": "https://.../fvf.jpg", "prompt": "红色跑车在赛道上飞驰，引擎轰鸣，速度感。"}], batch_size=1)`
+
+# 输出格式
+- 按照 "Thought (思考) -> Action (行动) -> Observation (观察) -> Final Answer (最终回答)" 模式呈现结果。
+- 语言表达专业、清晰，对每个步骤的描述准确明了。
+- 若使用工具，需明确写出工具名称及具体参数。
+```
+'''
diff --git a/02-use-cases/data_analysis_with_datalake/requirements.txt b/02-use-cases/data_analysis_with_datalake/requirements.txt
@@ -0,0 +1,9 @@
+veadk-python
+veadk-python[extensions]
+google-adk
+python-dotenv
+lancedb
+agentkit-sdk-python
+volcengine-python-sdk[ark]
+pyarrow
+duckdb
diff --git a/02-use-cases/data_analysis_with_datalake/settings.txt b/02-use-cases/data_analysis_with_datalake/settings.txt
@@ -0,0 +1 @@
+# Ark (OpenAI compatible)
diff --git a/02-use-cases/data_analysis_with_datalake/tools/__init__.py b/02-use-cases/data_analysis_with_datalake/tools/__init__.py
@@ -0,0 +1,2 @@
+# Package marker for tools modules
+
diff --git a/02-use-cases/data_analysis_with_datalake/tools/catalog_discovery.py b/02-use-cases/data_analysis_with_datalake/tools/catalog_discovery.py
@@ -0,0 +1,51 @@
+import os
+import json
+
+from rich.console import Console
+
+# Import the LanceDBManager singleton
+from .lancedb_manager import lancedb_manager
+# Import utility functions
+from .utils import get_text_embedding as get_embedding
+
+console = Console()
+
+def catalog_discovery(query_intent: str) -> str:
+    """Search the metadata table by vector similarity to the user's intent.
+
+    Args:
+        query_intent: Keywords describing what the user is looking for.
+
+    Returns:
+        A JSON string. On success it holds ``status`` ("ok"), ``records``
+        (up to 10 rows with the ``vector`` column removed), ``meta`` and
+        ``echo``. On any failure it holds ``status`` ("error") and ``error``.
+    """
+    # markup=False: the bracketed label and the user's text must be printed
+    # literally instead of being parsed as Rich style tags.
+    console.print(f"[catalog_discovery] Inputs: query_intent={query_intent!r}", markup=False)
+
+    if not query_intent:
+        return json.dumps({
+            "status": "error",
+            "error": "Query intent is empty. Please provide a keyword to search."
+        }, ensure_ascii=False)
+
+    tbl, error_msg = lancedb_manager.get_metadata_table()
+    if error_msg:
+        # Every failure carries "status": "error" so callers can branch on one key
+        return json.dumps({"status": "error", "error": error_msg}, ensure_ascii=False)
+
+    try:
+        # Call Ark to obtain the embedding vector for the query text
+        query_vector, emb_err = get_embedding(query_intent)
+        if emb_err:
+            return json.dumps({"status": "error", "error": emb_err}, ensure_ascii=False)
+
+        # Run the similarity search in Lance and keep the 10 closest rows
+        results_df = tbl.search(query_vector, vector_column_name="vector").limit(10).to_pandas()
+        records = results_df.to_dict("records")
+
+        # Remove the vector column from the records before returning to the agent
+        for record in records:
+            record.pop("vector", None)
+
+        console.print(f"✅ 检索到 {len(records)} 条相关元数据")
+        # ensure_ascii=False keeps non-ASCII text readable, matching the
+        # DuckDB tool's output convention
+        return json.dumps({
+            "status": "ok",
+            "records": records,
+            "meta": {"row_count": len(records)},
+            "echo": {"query_intent": query_intent}
+        }, ensure_ascii=False)
+    except Exception as e:
+        error_msg = f"❌ 检索失败: {e}"
+        # Style via the argument rather than inline tags so brackets inside
+        # the exception text cannot break markup parsing
+        console.print(error_msg, style="red", markup=False)
+        return json.dumps({"status": "error", "error": error_msg}, ensure_ascii=False)
diff --git a/02-use-cases/data_analysis_with_datalake/tools/duckdb_sql_execution.py b/02-use-cases/data_analysis_with_datalake/tools/duckdb_sql_execution.py
@@ -0,0 +1,61 @@
+import os
+import json
+
+from rich.console import Console
+
+# Import the LanceDBManager singleton
+from .lancedb_manager import lancedb_manager
+
+console = Console()
+
+def duckdb_sql_execution(sql: str, user_question: str = "") -> str:
+    """Execute a SQL string via DuckDB against the Lance table contents.
+
+    The Lance table is registered in DuckDB under the name
+    ``imdb_top_1000`` and ``sql`` is then executed as given.
+
+    NOTE(review): ``sql`` is run verbatim with no restriction to read-only
+    statements; confirm that is acceptable for the callers of this tool.
+
+    Args:
+        sql: A direct SQL string, e.g. "SELECT ...".
+        user_question: Only logged; it does not influence execution.
+
+    Returns:
+        A JSON string. On success it holds ``status`` ("ok"), ``data``
+        (header row followed by value rows), ``records`` (list of row
+        objects) and ``meta``. On failure it holds ``status`` ("error") and
+        ``error``.
+    """
+    # markup=False: SQL and bracketed labels are printed literally instead
+    # of being parsed as Rich style tags.
+    console.print(
+        f"[duckdb_sql_execution] Inputs: sql={sql!r}, user_question={user_question!r}",
+        markup=False,
+    )
+    if not sql or not isinstance(sql, str):
+        return json.dumps({"status": "error", "error": "SQL 字符串缺失或类型错误"}, ensure_ascii=False)
+
+    # Open the table using the LanceDBManager
+    tbl, err = lancedb_manager.open_table()
+    if err:
+        return json.dumps({"status": "error", "error": err}, ensure_ascii=False)
+
+    view_name = "imdb_top_1000"
+
+    # Register the table contents in DuckDB: Arrow first, pandas as fallback
+    conn = lancedb_manager.get_duckdb_connection()
+    try:
+        try:
+            conn.register(view_name, tbl.to_arrow())
+        except Exception:
+            conn.register(view_name, tbl.to_pandas())
+    except Exception as e:
+        # Both registration paths failed; report instead of raising
+        return json.dumps({"status": "error", "error": f"DuckDB 注册失败: {e}"}, ensure_ascii=False)
+
+    # Execute SQL
+    try:
+        out_df = conn.execute(sql).fetchdf()
+    except Exception as e:
+        return json.dumps({"status": "error", "error": f"DuckDB 执行失败: {e}"}, ensure_ascii=False)
+
+    # Build both result shapes: row objects and a header + value-rows table
+    header = [str(c) for c in out_df.columns]
+    records_obj = out_df.to_dict(orient="records")
+    records = out_df.values.tolist()
+
+    console.print(f"[sql] Returned rows: {len(records)} from table='{view_name}'", markup=False)
+    result = {
+        "status": "ok",
+        "data": [header] + records,
+        "records": records_obj,
+        "meta": {
+            "row_count": len(records),
+            "table": view_name,
+        }
+    }
+    # default=str: query results may contain values json cannot encode
+    # natively (for example dates or decimals); render them as text rather
+    # than letting serialization raise.
+    return json.dumps(result, ensure_ascii=False, default=str)
diff --git a/02-use-cases/data_analysis_with_datalake/tools/lancedb_hybrid_execution.py b/02-use-cases/data_analysis_with_datalake/tools/lancedb_hybrid_execution.py
diff --git a/02-use-cases/data_analysis_with_datalake/tools/lancedb_manager.py b/02-use-cases/data_analysis_with_datalake/tools/lancedb_manager.py
diff --git a/02-use-cases/data_analysis_with_datalake/tools/utils.py b/02-use-cases/data_analysis_with_datalake/tools/utils.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Package marker for tools modules`
	`2`	`+`