Skip to content

Commit 0ed7bbf

Browse files
[search online] 优化联网搜索插件性能 (#490)
* [search online] 优化联网搜索插件性能 * [search online] 修改检视意见
1 parent 8a84db5 commit 0ed7bbf

File tree

2 files changed

+180
-89
lines changed

2 files changed

+180
-89
lines changed

app-builder/plugins/fit_py_internet_search/conf/application.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
internet-search:
22
max_results_per_provider: 5
3+
summary-length: 150
34
api-key:
45
exa: "https://dashboard.exa.ai/home -- 登录获取api key"
56
tavily: "https://app.tavily.com/home -- 登录获取api key"

app-builder/plugins/fit_py_internet_search/src/internet_search.py

Lines changed: 179 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
# Licensed under the MIT License. See License.txt in the project root for license information.
55
# ======================================================================================================================
66
import json
7+
import re
8+
from concurrent.futures import ThreadPoolExecutor, as_completed
79
from dataclasses import dataclass
810
from typing import Dict, List, Optional, Sequence
911
from linkup import LinkupClient
@@ -69,12 +71,158 @@ def _get_max_results_per_provider() -> int:
6971
pass
7072

7173

74+
@value('internet-search.summary-length')
def _get_max_summary_length() -> int:
    # Stub bound by the @value decorator to the 'internet-search.summary-length'
    # entry in application.yml; the framework supplies the actual return value.
    pass
77+
78+
7279
def _truncate(text: str, max_chars: int) -> str:
7380
if len(text) <= max_chars:
7481
return text
7582
return text[: max_chars - 1].rstrip() + "…"
7683

7784

85+
def _extract_summary(text: str, max_sentences: int = 4) -> str:
86+
"""
87+
从文本中提取前几句话作为摘要
88+
89+
Args:
90+
text: 原始文本
91+
max_sentences: 最多保留的句子数,默认为4句
92+
93+
Returns:
94+
摘要文本
95+
"""
96+
if not text:
97+
return ""
98+
99+
# 使用正则表达式匹配句子结束符号
100+
sentences = re.split(r'([。!?\.!?]+["\'»\)]?\s*)', text)
101+
102+
# 重新组合句子(将分隔符和句子内容合并)
103+
combined_sentences = []
104+
for i in range(0, len(sentences) - 1, 2):
105+
sentence = sentences[i]
106+
separator = sentences[i + 1] if i + 1 < len(sentences) else ""
107+
combined = (sentence + separator).strip()
108+
if combined:
109+
combined_sentences.append(combined)
110+
111+
# 如果最后一个元素没有分隔符
112+
if len(sentences) % 2 == 1 and sentences[-1].strip():
113+
combined_sentences.append(sentences[-1].strip())
114+
115+
# 取前 max_sentences 句
116+
if len(combined_sentences) <= max_sentences:
117+
summary = " ".join(combined_sentences)
118+
else:
119+
summary = " ".join(combined_sentences[:max_sentences])
120+
121+
# 确保摘要不会过长(最多150字符)
122+
if len(summary) > _get_max_summary_length():
123+
summary = summary[:(_get_max_summary_length() - 3)].rstrip() + "..."
124+
125+
return summary
126+
127+
128+
def _search_exa(query: str, api_key: str, max_results: int, max_snippet_chars: int) -> List[SearchItem]:
    """Query the Exa provider and convert its hits into SearchItem objects.

    Any provider failure is logged and swallowed so one broken provider does
    not abort the overall search; an empty list is returned in that case.
    """
    results: List[SearchItem] = []
    try:
        response = Exa(api_key=api_key).search_and_contents(
            query,
            text={"max_characters": 2000},
            livecrawl="always",
            num_results=max_results,
        )
        for index, hit in enumerate(getattr(response, "results", [])[:max_results]):
            snippet = _truncate(
                getattr(hit, "text", "") or getattr(hit, "content", "") or "",
                max_snippet_chars,
            )
            # Keep only the leading sentences as a compact summary.
            summary = _extract_summary(snippet)
            metadata = {
                "fileName": getattr(hit, "title", "") or "",
                "url": getattr(hit, "url", "") or "",
                "source": "exa",
                "published_date": getattr(hit, "published_date", None),
                "summary": summary,
            }
            results.append(
                SearchItem(
                    id=getattr(hit, "id", "") or f"exa_{index}",
                    text=summary,
                    score=12.0,
                    metadata=metadata,
                )
            )
    except Exception as e:
        sys_plugin_logger.warning(f'Failed to search in Exa tool: {str(e)}')
    return results
159+
160+
161+
def _search_tavily(query: str, api_key: str, max_results: int, max_snippet_chars: int) -> List[SearchItem]:
    """Query the Tavily provider and convert its hits into SearchItem objects.

    Any provider failure is logged and swallowed so one broken provider does
    not abort the overall search; an empty list is returned in that case.
    """
    results: List[SearchItem] = []
    try:
        client = TavilyClient(api_key=api_key)
        payload = client.search(
            query=query,
            max_results=max_results,
            include_images=False,
        )
        for index, hit in enumerate(payload.get("results", [])[:max_results]):
            snippet = _truncate(hit.get("content", "") or "", max_snippet_chars)
            # Keep only the leading sentences as a compact summary.
            summary = _extract_summary(snippet)
            results.append(
                SearchItem(
                    id=hit.get("id", "") or f"tavily_{index}",
                    text=summary,
                    score=12.0,
                    metadata={
                        "fileName": hit.get("title", "") or "",
                        "url": hit.get("url", "") or "",
                        "source": "tavily",
                        "published_date": hit.get("published_date"),
                        "summary": summary,
                    },
                )
            )
    except Exception as e:
        sys_plugin_logger.warning(f'Failed to search in Tavily tool: {str(e)}')
    return results
191+
192+
193+
def _search_linkup(query: str, api_key: str, max_results: int, max_snippet_chars: int) -> List[SearchItem]:
    """Query the Linkup provider and convert its hits into SearchItem objects.

    Any provider failure is logged and swallowed so one broken provider does
    not abort the overall search; an empty list is returned in that case.
    """
    results: List[SearchItem] = []
    try:
        client = LinkupClient(api_key=api_key)
        payload = client.search(
            query=query,
            depth="standard",
            output_type="searchResults",
            include_images=False,
        )
        for index, hit in enumerate(getattr(payload, "results", [])[:max_results]):
            snippet = _truncate(
                getattr(hit, "content", "") or getattr(hit, "text", "") or "",
                max_snippet_chars,
            )
            # Keep only the leading sentences as a compact summary.
            summary = _extract_summary(snippet)
            results.append(
                SearchItem(
                    id=getattr(hit, "id", "") or f"linkup_{index}",
                    text=summary,
                    score=12.0,
                    metadata={
                        "fileName": getattr(hit, "name", None) or getattr(hit, "title", "") or "",
                        "url": getattr(hit, "url", "") or "",
                        "source": "linkup",
                        # Linkup results carry no publication date.
                        "published_date": None,
                        "summary": summary,
                    },
                )
            )
    except Exception as e:
        sys_plugin_logger.warning(f'Failed to search in Linkup tool: {str(e)}')
    return results
224+
225+
78226
def _internet_search(
79227
query: str,
80228
api_keys: Dict[str, str],
@@ -88,102 +236,44 @@ def _internet_search(
88236
for name in ("exa", "tavily", "linkup"):
89237
if api_keys.get(name):
90238
selected.append(name)
91-
items: List[SearchItem] = []
92-
errors = [] # 记录失败的搜索工具
93239

94-
# Exa
240+
# 准备并行搜索任务
241+
search_tasks = []
95242
if "exa" in selected and api_keys.get("exa"):
96-
try:
97-
exa_client = Exa(api_key=api_keys["exa"])
98-
res = exa_client.search_and_contents(
99-
query,
100-
text={"max_characters": 2000},
101-
livecrawl="always",
102-
num_results=max_results_per_provider,
103-
)
104-
for i, r in enumerate(getattr(res, "results", [])[:max_results_per_provider]):
105-
text = _truncate(getattr(r, "text", "") or getattr(r, "content", "") or "", max_snippet_chars)
106-
items.append(
107-
SearchItem(
108-
id=getattr(r, "id", "") or f"exa_{i}",
109-
text=text,
110-
score=12.0, # 使用float确保序列化
111-
metadata={
112-
"fileName": getattr(r, "title", "") or "",
113-
"url": getattr(r, "url", "") or "",
114-
"source": "exa",
115-
"published_date": getattr(r, "published_date", None),
116-
"summary": text,
117-
}
118-
)
119-
)
120-
except Exception as e:
121-
sys_plugin_logger.warning(f'Failed to search in Exa tool: {str(e)}')
122-
errors.append("exa")
123-
124-
# Tavily
243+
search_tasks.append(("exa", _search_exa, api_keys["exa"]))
125244
if "tavily" in selected and api_keys.get("tavily"):
126-
try:
127-
tavily_client = TavilyClient(api_key=api_keys["tavily"])
128-
res = tavily_client.search(
129-
query=query,
130-
max_results=max_results_per_provider,
131-
include_images=False,
132-
)
133-
for i, r in enumerate(res.get("results", [])[:max_results_per_provider]):
134-
text = _truncate(r.get("content", "") or "", max_snippet_chars)
135-
items.append(
136-
SearchItem(
137-
id=r.get("id", "") or f"tavily_{i}",
138-
text=text,
139-
score=12.0,
140-
metadata={
141-
"fileName": r.get("title", "") or "",
142-
"url": r.get("url", "") or "",
143-
"source": "tavily",
144-
"published_date": r.get("published_date"),
145-
"summary": text,
146-
}
147-
)
148-
)
149-
except Exception as e:
150-
sys_plugin_logger.warning(f'Failed to search in Tavily tool: {str(e)}')
151-
errors.append("tavily")
152-
153-
# Linkup
245+
search_tasks.append(("tavily", _search_tavily, api_keys["tavily"]))
154246
if "linkup" in selected and api_keys.get("linkup"):
155-
try:
156-
linkup_client = LinkupClient(api_key=api_keys["linkup"])
157-
resp = linkup_client.search(
158-
query=query,
159-
depth="standard",
160-
output_type="searchResults",
161-
include_images=False,
162-
)
163-
for i, r in enumerate(getattr(resp, "results", [])[:max_results_per_provider]):
164-
text = _truncate(getattr(r, "content", "") or getattr(r, "text", "") or "", max_snippet_chars)
165-
items.append(
166-
SearchItem(
167-
id=getattr(r, "id", "") or f"linkup_{i}",
168-
text=text,
169-
score=12.0,
170-
metadata={
171-
"fileName": getattr(r, "name", None) or getattr(r, "title", "") or "",
172-
"url": getattr(r, "url", "") or "",
173-
"source": "linkup",
174-
"published_date": None,
175-
"summary": text,
176-
}
177-
)
178-
)
179-
except Exception as e:
180-
sys_plugin_logger.warning(f'Failed to search in Linkup tool: {str(e)}')
181-
errors.append("linkup")
182-
247+
search_tasks.append(("linkup", _search_linkup, api_keys["linkup"]))
248+
249+
# 使用线程池并行执行搜索
250+
items: List[SearchItem] = []
251+
errors = []
252+
253+
with ThreadPoolExecutor(max_workers=len(search_tasks)) as executor:
254+
# 提交所有搜索任务
255+
future_to_provider = {
256+
executor.submit(task_func, query, api_key, max_results_per_provider, max_snippet_chars): provider_name
257+
for provider_name, task_func, api_key in search_tasks
258+
}
259+
260+
# 收集结果
261+
for future in as_completed(future_to_provider):
262+
provider_name = future_to_provider[future]
263+
try:
264+
results = future.result()
265+
if results:
266+
items.extend(results)
267+
else:
268+
errors.append(provider_name)
269+
except Exception as e:
270+
sys_plugin_logger.error(f'Unexpected error in {provider_name} search: {str(e)}')
271+
errors.append(provider_name)
272+
183273
# 如果所有搜索都失败了,才抛出异常
184274
if not items and errors:
185275
raise FitException(
186-
InternalErrorCode.CLIENT_ERROR,
276+
InternalErrorCode.CLIENT_ERROR,
187277
f'All search tools failed: {", ".join(errors)}'
188278
)
189279

0 commit comments

Comments
 (0)