Skip to content

Commit a2a5fea

Browse files
committed
✨(websearch) add Brave llm/context snippets
Use the llm/context endpoint with snippets and change the tool name to web_search. Signed-off-by: camilleAND <camille.andre@modernisation.gouv.fr>
1 parent 6dd41e8 commit a2a5fea

File tree

6 files changed

+89
-13
lines changed

6 files changed

+89
-13
lines changed

src/backend/chat/agents/conversation.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,8 @@ def get_web_search_tool_name(self) -> str | None:
128128
"""
129129
for toolset in self.toolsets:
130130
for tool in toolset.tools.values():
131-
if tool.name.startswith("web_search_"):
131+
# Support both legacy names (web_search_*) and the new generic "web_search"
132+
if tool.name == "web_search" or tool.name.startswith("web_search_"):
132133
return tool.name
133134
return None
134135

src/backend/chat/tests/clients/pydantic_ai/test_smart_web_search.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def _llm_config_with_websearch(settings):
2323
is_active=True,
2424
icon=None,
2525
system_prompt="You are an amazing assistant.",
26-
tools=["web_search_brave_with_document_backend"],
26+
tools=["web_search"],
2727
provider=LLMProvider(
2828
hrid="unused",
2929
base_url="https://example.com",
@@ -68,7 +68,7 @@ def test_smart_search_enabled_tool_is_called(_llm_config_with_websearch):
6868
with service.conversation_agent.override(model=TestModel(), deps=service._context_deps):
6969
response = service.conversation_agent.run_sync("Search the web for something.")
7070

71-
assert "web_search_brave_with_document_backend" in response.output
71+
assert "web_search" in response.output
7272

7373

7474
def test_force_websearch_overrides_smart_search_disabled(_llm_config_with_websearch):
@@ -92,4 +92,4 @@ def test_force_websearch_overrides_smart_search_disabled(_llm_config_with_websea
9292
)
9393
with service.conversation_agent.override(model=TestModel(), deps=service._context_deps):
9494
response = service.conversation_agent.run_sync("Search the web for something.")
95-
assert "web_search_brave_with_document_backend" in response.output
95+
assert "web_search" in response.output

src/backend/chat/tests/tools/test_web_search_brave.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
web_search_brave_with_document_backend,
2828
)
2929

30-
BRAVE_URL = "https://api.search.brave.com/res/v1/web/search"
30+
# Must match the URL used in _query_brave_api_async
31+
BRAVE_URL = "https://api.search.brave.com/res/v1/llm/context"
3132

3233

3334
@pytest.fixture(autouse=True)

src/backend/chat/tools/__init__.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,17 @@ def get_pydantic_tools_by_name(name: str) -> Tool:
2323
prepare=only_if_web_search_enabled,
2424
max_retries=2,
2525
),
26+
# Backward-compatible alias (older settings may still reference this tool name).
2627
"web_search_brave_with_document_backend": Tool(
2728
web_search_brave_with_document_backend,
29+
name="web_search_brave_with_document_backend",
30+
takes_ctx=True,
31+
prepare=only_if_web_search_enabled,
32+
max_retries=2,
33+
),
34+
"web_search": Tool(
35+
web_search_brave_with_document_backend,
36+
name="web_search",
2837
takes_ctx=True,
2938
prepare=only_if_web_search_enabled,
3039
max_retries=2,

src/backend/chat/tools/web_search_brave.py

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,18 @@ async def _fetch_and_extract_async(url: str) -> str:
101101
raise DocumentFetchError(f"Failed to extract content from {url}: {e}") from e
102102

103103

104+
def _get_snippets_from_result(result: dict) -> List[str]:
105+
"""Return merged snippets/extra_snippets as a list, guarding against None."""
106+
snippets = result.get("snippets") or []
107+
extra_snippets = result.get("extra_snippets") or []
108+
# Both are expected to be lists of strings; fall back to one or the other if needed.
109+
if snippets and not extra_snippets:
110+
return snippets
111+
if extra_snippets and not snippets:
112+
return extra_snippets
113+
return snippets or extra_snippets
114+
115+
104116
async def _extract_and_summarize_snippets_async(query: str, url: str) -> List[str]:
105117
"""Fetch, extract and summarize text content from the URL.
106118
@@ -144,7 +156,11 @@ async def _fetch_and_store_async(url: str, document_store, **kwargs) -> None:
144156

145157
async def _query_brave_api_async(query: str) -> List[dict]:
146158
"""Query the Brave Search API and return the raw results."""
147-
url = "https://api.search.brave.com/res/v1/web/search"
159+
# NOTE:
160+
# - Standard web search endpoint: https://api.search.brave.com/res/v1/web/search
161+
# - LLM context endpoint: https://api.search.brave.com/res/v1/llm/context
162+
# The latter returns results under `grounding.generic` instead of `web.results`.
163+
url = "https://api.search.brave.com/res/v1/llm/context"
148164
headers = {
149165
"Accept": "application/json",
150166
"X-Subscription-Token": settings.BRAVE_API_KEY,
@@ -158,6 +174,10 @@ async def _query_brave_api_async(query: str) -> List[dict]:
158174
"spellcheck": settings.BRAVE_SEARCH_SPELLCHECK,
159175
"result_filter": "web,faq,query",
160176
"extra_snippets": settings.BRAVE_SEARCH_EXTRA_SNIPPETS,
177+
"maximum_number_of_urls": settings.BRAVE_MAX_RESULTS,
178+
"maximum_number_of_tokens": settings.BRAVE_MAX_TOKENS,
179+
"maximum_number_of_snippets": settings.BRAVE_MAX_SNIPPETS,
180+
"maximum_number_of_snippets_per_url": settings.BRAVE_MAX_SNIPPETS_PER_URL,
161181
}
162182
params = {k: v for k, v in data.items() if v is not None}
163183

@@ -167,6 +187,29 @@ async def _query_brave_api_async(query: str) -> List[dict]:
167187
response.raise_for_status()
168188
json_response = response.json()
169189

190+
# LLM context API: results are under `grounding.generic`
191+
# See: https://api-dashboard.search.brave.com/documentation/services/llm-context
192+
if "grounding" in json_response:
193+
generic_results = json_response.get("grounding", {}).get("generic", []) or []
194+
normalized_results: List[dict] = []
195+
for item in generic_results:
196+
item_url = item.get("url")
197+
if not item_url:
198+
continue
199+
200+
normalized_results.append(
201+
{
202+
"url": item_url,
203+
# Fallback to URL if no title is provided
204+
"title": item.get("title") or item_url,
205+
# `snippets` is already a list
206+
"snippets": item.get("snippets") or [],
207+
}
208+
)
209+
210+
return normalized_results
211+
212+
# Fallback for classic web search JSON shape, if we ever switch back
170213
# https://api-dashboard.search.brave.com/app/documentation/web-search/responses#Result
171214
return json_response.get("web", {}).get("results", [])
172215

@@ -211,20 +254,22 @@ async def _query_brave_api_async(query: str) -> List[dict]:
211254

212255
def format_tool_return(raw_search_results: List[dict]) -> ToolReturn:
    """Format the raw search results into a ToolReturn object.

    Results for which `_get_snippets_from_result` yields nothing are left out
    of both the return value and the sources. The remaining entries are keyed
    by the string form of their index in ``raw_search_results``, so keys may
    have gaps.

    Args:
        raw_search_results: Raw result dicts. Each one is read through its
            "url" key, and through "title" when it has snippets.

    Returns:
        A ToolReturn whose ``return_value`` maps each kept index to its
        url/title/snippets, and whose ``metadata["sources"]`` is the set of
        URLs of the kept results.

    Raises:
        KeyError: If a result lacks "url", or lacks "title" while having
            snippets.
    """
    logger.debug("Raw search results: %s", raw_search_results)
    logger.debug("Unduplicated sources: %s", {result["url"] for result in raw_search_results})

    # Resolve the snippets once per result instead of once per use
    # (return value, filter and sources all need the same answer).
    with_snippets = []
    for idx, result in enumerate(raw_search_results):
        snippets = _get_snippets_from_result(result)
        if snippets:
            with_snippets.append((idx, result, snippets))

    return ToolReturn(
        # Format return value "mistral-like": https://docs.mistral.ai/capabilities/citations/
        return_value={
            str(idx): {
                "url": result["url"],
                "title": result["title"],
                "snippets": snippets,
            }
            for idx, result, snippets in with_snippets
        },
        metadata={
            "sources": {result["url"] for _, result, _ in with_snippets},
        },
    )
@@ -244,9 +289,12 @@ async def web_search_brave(_ctx: RunContext, query: str) -> ToolReturn:
244289

245290
await sync_to_async(reset_caches)() # Clear trafilatura caches to avoid memory bloat/leaks
246291

247-
# Parallelize fetch/extract for results that don't include extra_snippets
292+
# Parallelize fetch/extract only for results that don't already include any snippets
293+
# (neither Brave `snippets` nor `extra_snippets`).
248294
to_process = [
249-
(idx, r) for idx, r in enumerate(raw_search_results) if not r.get("extra_snippets")
295+
(idx, r)
296+
for idx, r in enumerate(raw_search_results)
297+
if not r.get("extra_snippets") and not r.get("snippets")
250298
]
251299

252300
if to_process:
@@ -292,7 +340,7 @@ async def web_search_brave_with_document_backend(ctx: RunContext, query: str) ->
292340
ctx (RunContext): The run context containing the conversation.
293341
query (str): The query to search for.
294342
"""
295-
logger.info("Starting web search with RAG backend for query: %s", query)
343+
logger.debug("Starting web search with RAG backend for query: %s", query)
296344
try:
297345
raw_search_results = await _query_brave_api_async(query)
298346

@@ -328,7 +376,7 @@ async def web_search_brave_with_document_backend(ctx: RunContext, query: str) ->
328376
session=ctx.deps.session,
329377
user_sub=ctx.deps.user.sub,
330378
)
331-
logger.info("RAG search returned: %s", rag_results)
379+
logger.debug("RAG search returned: %s", rag_results)
332380

333381
ctx.usage += RunUsage(
334382
input_tokens=rag_results.usage.prompt_tokens,

src/backend/conversations/brave_settings.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,20 @@ class BraveSettings:
7474
environ_name="BRAVE_SEARCH_EXTRA_SNIPPETS",
7575
environ_prefix=None,
7676
)
77+
78+
# LLM context endpoint limits
79+
BRAVE_MAX_TOKENS = values.IntegerValue(
80+
default=8192,
81+
environ_name="BRAVE_MAX_TOKENS",
82+
environ_prefix=None,
83+
)
84+
BRAVE_MAX_SNIPPETS = values.IntegerValue(
85+
default=50,
86+
environ_name="BRAVE_MAX_SNIPPETS",
87+
environ_prefix=None,
88+
)
89+
BRAVE_MAX_SNIPPETS_PER_URL = values.IntegerValue(
90+
default=10,
91+
environ_name="BRAVE_MAX_SNIPPETS_PER_URL",
92+
environ_prefix=None,
93+
)

0 commit comments

Comments
 (0)