Skip to content

Commit 21b0107

Browse files
committed
feat: add bocha search
1 parent a32fe81 commit 21b0107

File tree

4 files changed

+250
-1
lines changed

4 files changed

+250
-1
lines changed
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""
Example: Using InternetRetrieverFactory with BochaAISearchRetriever

The BochaAI API key is read from the ``BOCHA_API_KEY`` environment variable so
that no real key has to be written into this file; the placeholder ``sk-xxxx``
is used when the variable is unset.
"""

import os

from memos.configs.embedder import EmbedderConfigFactory
from memos.configs.internet_retriever import InternetRetrieverConfigFactory
from memos.embedders.factory import EmbedderFactory
from memos.memories.textual.tree_text_memory.retrieve.internet_retriever_factory import (
    InternetRetrieverFactory,
)


def print_results(results) -> None:
    """Print the number of retrieved memory items and a short preview of each."""
    print(f"Retrieved {len(results)} memory items.")
    for idx, item in enumerate(results, 1):
        print(f"[{idx}] {item.memory[:100]}...")  # preview first 100 chars

    print("==" * 20)


def main() -> None:
    """Build a BochaAI retriever through the factory and run both search modes."""
    # ========= 1. Create an embedder =========
    embedder_config = EmbedderConfigFactory.model_validate(
        {
            "backend": "ollama",  # Or "sentence_transformer", etc.
            "config": {
                "model_name_or_path": "nomic-embed-text:latest",
            },
        }
    )
    embedder = EmbedderFactory.from_config(embedder_config)

    # ========= 2. Create retriever config for BochaAI =========
    retriever_config = InternetRetrieverConfigFactory.model_validate(
        {
            "backend": "bocha",
            "config": {
                # Taken from the environment; falls back to a placeholder key.
                "api_key": os.environ.get("BOCHA_API_KEY", "sk-xxxx"),
                "search_engine_id": "",  # Not required for BochaAI, but field exists for API consistency
                "max_results": 5,
                "reader": {  # Reader config for chunking web content
                    "backend": "simple",
                    "config": {},
                },
            },
        }
    )

    # ========= 3. Build retriever instance via factory =========
    retriever = InternetRetrieverFactory.from_config(retriever_config, embedder)

    # ========= 4. Run BochaAI Web Search =========
    print("=== Scenario 1: Web Search (BochaAI) ===")
    query_web = "Alibaba 2024 ESG report"
    print_results(retriever.retrieve_from_web(query_web))

    # ========= 5. Run BochaAI AI Search =========
    print("=== Scenario 2: AI Search (BochaAI) ===")
    query_ai = "Weather in Beijing"
    print_results(retriever.retrieve_from_ai(query_ai))


# Guarded so that importing this module does not trigger network requests.
if __name__ == "__main__":
    main()

src/memos/configs/internet_retriever.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ class XinyuSearchConfig(BaseInternetRetrieverConfig):
5555
)
5656

5757

58+
class BochaSearchConfig(XinyuSearchConfig):
    """Configuration for the Bocha Search API backend.

    Declares no fields of its own; everything is inherited from
    ``XinyuSearchConfig``.
    """
60+
61+
5862
class InternetRetrieverConfigFactory(BaseConfig):
5963
"""Factory class for creating internet retriever configurations."""
6064

@@ -69,6 +73,7 @@ class InternetRetrieverConfigFactory(BaseConfig):
6973
"google": GoogleCustomSearchConfig,
7074
"bing": BingSearchConfig,
7175
"xinyu": XinyuSearchConfig,
76+
"bocha": BochaSearchConfig,
7277
}
7378

7479
@field_validator("backend")
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
"""BochaAI Search API retriever for tree text memory."""
2+
3+
from concurrent.futures import ThreadPoolExecutor, as_completed
4+
from datetime import datetime
5+
6+
import requests
7+
8+
from memos.embedders.factory import OllamaEmbedder
9+
from memos.log import get_logger
10+
from memos.mem_reader.base import BaseMemReader
11+
from memos.memories.textual.item import TextualMemoryItem
12+
13+
14+
logger = get_logger(__name__)
15+
16+
17+
class BochaAISearchAPI:
    """BochaAI Search API Client"""

    def __init__(self, api_key: str, max_results: int = 20, timeout: float = 30.0):
        """
        Initialize BochaAI Search API client.

        Args:
            api_key: BochaAI API key, sent as a Bearer token
            max_results: Maximum number of search results to retrieve
                (sent to the API as ``count``)
            timeout: Value passed as ``timeout`` to ``requests.post`` (seconds).
                Without one, requests would wait on a stalled server forever.
        """
        self.api_key = api_key
        self.max_results = max_results
        self.timeout = timeout

        self.web_url = "https://api.bochaai.com/v1/web-search"
        self.ai_url = "https://api.bochaai.com/v1/ai-search"

        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    def search_web(self, query: str, summary: bool = True, freshness="noLimit") -> list[dict]:
        """
        Perform a Web Search.

        Args:
            query: Search query string
            summary: Whether to include summary in the results
            freshness: Freshness filter (e.g. 'noLimit', 'day', 'week')

        Returns:
            A list of search result dicts; empty if the request failed
        """
        body = {
            "query": query,
            "summary": summary,
            "freshness": freshness,
            "count": self.max_results,
        }
        return self._post(self.web_url, body)

    def search_ai(
        self, query: str, answer: bool = False, stream: bool = False, freshness="noLimit"
    ) -> list[dict]:
        """
        Perform an AI Search.

        Args:
            query: Search query string
            answer: Whether BochaAI should generate an answer
            stream: Whether to use streaming response
            freshness: Freshness filter (e.g. 'noLimit', 'day', 'week')

        Returns:
            A list of search result dicts; empty if the request failed
        """
        # NOTE(review): the response is parsed as one JSON document, so
        # stream=True presumably ends in the error path (empty list) - confirm.
        body = {
            "query": query,
            "freshness": freshness,
            "count": self.max_results,
            "answer": answer,
            "stream": stream,
        }
        return self._post(self.ai_url, body)

    def _post(self, url: str, body: dict) -> list[dict]:
        """
        Send a POST request and return the parsed result list.

        Best-effort: any failure (network error, timeout, non-2xx status,
        invalid JSON) is logged and reported as an empty list.
        """
        try:
            resp = requests.post(url, headers=self.headers, json=body, timeout=self.timeout)
            resp.raise_for_status()
            return self._extract_results(resp.json())
        except Exception:
            import traceback

            logger.error(f"BochaAI search error: {traceback.format_exc()}")
            return []

    @staticmethod
    def _extract_results(data) -> list[dict]:
        """
        Pull the ``results`` list out of a decoded response payload.

        Always returns a list so callers can iterate safely: a payload that is
        not a dict, or whose ``results`` is missing, null or not a list, yields
        an empty list.
        """
        # NOTE(review): the "results" key is what this client has always read;
        # verify it against the response schema in the BochaAI API docs.
        if not isinstance(data, dict):
            return []
        results = data.get("results")
        return results if isinstance(results, list) else []
95+
96+
97+
class BochaAISearchRetriever:
    """BochaAI retriever that converts search results into TextualMemoryItem objects"""

    def __init__(
        self,
        api_key: str | None = None,
        embedder: OllamaEmbedder | None = None,
        reader: BaseMemReader | None = None,
        max_results: int = 20,
        *,
        access_key: str | None = None,
        search_engine_id: str | None = None,
    ):
        """
        Initialize BochaAI Search retriever.

        Args:
            api_key: BochaAI API key
            embedder: Embedder instance for generating embeddings
            reader: MemReader instance for processing internet content
            max_results: Maximum number of search results to retrieve
            access_key: Alias for ``api_key``. The retriever factory builds this
                class through the same call it uses for the "xinyu" backend,
                which passes the key under this name.
            search_engine_id: Accepted for the same reason and ignored.

        Raises:
            TypeError: If neither ``api_key`` nor ``access_key`` is given.
        """
        key = api_key if api_key is not None else access_key
        if key is None:
            raise TypeError("BochaAISearchRetriever requires 'api_key' (or 'access_key')")

        self.bocha_api = BochaAISearchAPI(key, max_results=max_results)
        self.embedder = embedder
        self.reader = reader

    def retrieve_from_web(
        self, query: str, top_k: int = 10, parsed_goal=None, info=None
    ) -> list[TextualMemoryItem]:
        """
        Retrieve information using BochaAI Web Search.

        ``top_k`` is accepted but not used by this implementation.
        """
        search_results = self.bocha_api.search_web(query)
        return self._convert_to_mem_items(search_results, query, parsed_goal, info)

    def retrieve_from_ai(
        self, query: str, top_k: int = 10, parsed_goal=None, info=None
    ) -> list[TextualMemoryItem]:
        """
        Retrieve information using BochaAI AI Search.

        ``top_k`` is accepted but not used by this implementation.
        """
        search_results = self.bocha_api.search_ai(query)
        return self._convert_to_mem_items(search_results, query, parsed_goal, info)

    def _convert_to_mem_items(
        self, search_results: list[dict], query: str, parsed_goal=None, info=None
    ) -> list[TextualMemoryItem]:
        """
        Convert API search results into TextualMemoryItem objects.

        Results are processed concurrently, then reassembled in the order the
        API returned them and deduplicated by memory text. A result whose
        processing fails is logged and skipped.
        """
        if not info:
            info = {"user_id": "", "session_id": ""}
        if not search_results:
            return []

        items_by_position = {}
        with ThreadPoolExecutor(max_workers=8) as executor:
            # Remember each future's position so completion order does not
            # decide the output order.
            positions = {
                executor.submit(self._process_result, r, query, parsed_goal, info): position
                for position, r in enumerate(search_results)
            }
            for future in as_completed(positions):
                try:
                    items_by_position[positions[future]] = future.result()
                except Exception as e:
                    logger.error(f"Error processing BochaAI search result: {e}")

        memory_items = []
        for position in sorted(items_by_position):
            memory_items.extend(items_by_position[position])

        # Deduplicate items by memory text
        unique_memory_items = {item.memory: item for item in memory_items}
        return list(unique_memory_items.values())

    def _process_result(
        self, result: dict, query: str, parsed_goal, info: dict
    ) -> list[TextualMemoryItem]:
        """
        Process a single result into one or more TextualMemoryItems.

        The result's ``content`` is chunked by the reader; each chunk is then
        prefixed with the title, date and summary and tagged as public web
        memory. ``query`` and ``parsed_goal`` are not used here.
        """
        title = result.get("title", "")
        content = result.get("content", "")
        summary = result.get("summary", "")
        url = result.get("url", "")
        # This is the retrieval date, not the page's publication date.
        publish_time = datetime.now().strftime(
            "%Y-%m-%d"
        )  # Optional: can map to API field if exists

        # Use reader to split and process the content into chunks
        read_items = self.reader.get_memory([content], type="doc", info=info)
        if not read_items:
            # Nothing came back for the single document that was submitted.
            return []

        memory_items = []
        for read_item_i in read_items[0]:
            read_item_i.memory = (
                f"Title: {title}\nNewsTime: {publish_time}\nSummary: {summary}\n"
                f"Content: {read_item_i.memory}"
            )
            read_item_i.metadata.source = "web"
            read_item_i.metadata.memory_type = "OuterMemory"
            read_item_i.metadata.sources = [url] if url else []
            read_item_i.metadata.visibility = "public"
            memory_items.append(read_item_i)
        return memory_items

src/memos/memories/textual/tree_text_memory/retrieve/internet_retriever_factory.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from memos.configs.internet_retriever import InternetRetrieverConfigFactory
66
from memos.embedders.base import BaseEmbedder
77
from memos.mem_reader.factory import MemReaderFactory
8+
from memos.memories.textual.tree_text_memory.retrieve.bochasearch import BochaAISearchRetriever
89
from memos.memories.textual.tree_text_memory.retrieve.internet_retriever import (
910
InternetGoogleRetriever,
1011
)
@@ -18,6 +19,7 @@ class InternetRetrieverFactory:
1819
"google": InternetGoogleRetriever,
1920
"bing": InternetGoogleRetriever, # TODO: Implement BingRetriever
2021
"xinyu": XinyuSearchRetriever,
22+
"bocha": BochaAISearchRetriever,
2123
}
2224

2325
@classmethod
@@ -62,7 +64,7 @@ def from_config(
6264
max_results=config.max_results,
6365
num_per_request=config.num_per_request,
6466
)
65-
elif backend == "xinyu":
67+
elif backend == "xinyu" or backend == "bocha":
6668
return retriever_class(
6769
access_key=config.api_key, # Use api_key as access_key for xinyu
6870
search_engine_id=config.search_engine_id,

0 commit comments

Comments
 (0)