Skip to content

Commit 222579b

Browse files
committed
Refactor get_sources_content to return DataPoints
1 parent c118875 commit 222579b

File tree

3 files changed

+25
-19
lines changed

3 files changed

+25
-19
lines changed

app/backend/approaches/approach.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -340,15 +340,17 @@ async def get_sources_content(
340340
use_semantic_captions: bool,
341341
download_image_sources: bool,
342342
user_oid: Optional[str] = None,
343-
) -> tuple[list[str], list[str], list[str]]:
344-
"""
345-
Extracts text and image sources from the search results.
346-
If use_semantic_captions is True, it will use the captions from the results.
347-
If use_image_sources is True, it will extract image URLs from the results.
343+
) -> DataPoints:
344+
"""Extract text/image sources & citations from documents.
345+
346+
Args:
347+
results: List of retrieved Document objects.
348+
use_semantic_captions: Whether to use semantic captions instead of full content text.
349+
download_image_sources: Whether to attempt downloading & base64 encoding referenced images.
350+
user_oid: Optional user object id for per-user storage access (ADLS scenarios).
351+
348352
Returns:
349-
- A list of text sources (captions or content).
350-
- A list of image sources (base64 encoded).
351-
- A list of allowed citations for those sources.
353+
DataPoints: Container holding text (list[str]), images (list[str] of base64 data URIs), and citations (list[str]).
352354
"""
353355

354356
def nonewlines(s: str) -> str:
@@ -380,8 +382,9 @@ def nonewlines(s: str) -> str:
380382
if url:
381383
image_sources.append(url)
382384
citations.append(self.get_image_citation(doc.sourcepage or "", img["url"]))
383-
384-
return text_sources, image_sources, citations
385+
if download_image_sources:
386+
return DataPoints(text=text_sources, images=image_sources, citations=citations)
387+
return DataPoints(text=text_sources, citations=citations)
385388

386389
def get_citation(self, sourcepage: Optional[str]):
387390
return sourcepage or ""

app/backend/approaches/chatreadretrieveread.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -339,12 +339,14 @@ async def run_search_approach(
339339
)
340340

341341
# STEP 3: Generate a contextual and content specific answer using the search results and chat history
342-
text_sources, image_sources, citations = await self.get_sources_content(
342+
data_points = await self.get_sources_content(
343343
results, use_semantic_captions, download_image_sources=send_image_sources, user_oid=auth_claims.get("oid")
344344
)
345+
if not send_text_sources:
346+
data_points = DataPoints(text=[], images=data_points.images, citations=data_points.citations)
345347

346348
extra_info = ExtraInfo(
347-
DataPoints(text=text_sources if send_text_sources else [], images=image_sources, citations=citations),
349+
data_points,
348350
thoughts=[
349351
self.format_thought_step_for_chatcompletion(
350352
title="Prompt to generate search query",
@@ -405,15 +407,17 @@ async def run_agentic_retrieval_approach(
405407
results_merge_strategy=results_merge_strategy,
406408
)
407409

408-
text_sources, image_sources, citations = await self.get_sources_content(
410+
data_points = await self.get_sources_content(
409411
results,
410412
use_semantic_captions=False,
411413
download_image_sources=send_image_sources,
412414
user_oid=auth_claims.get("oid"),
413415
)
416+
if not send_text_sources:
417+
data_points = DataPoints(text=[], images=data_points.images, citations=data_points.citations)
414418

415419
extra_info = ExtraInfo(
416-
DataPoints(text=text_sources if send_text_sources else [], images=image_sources, citations=citations),
420+
data_points,
417421
thoughts=[
418422
ThoughtStep(
419423
"Use agentic retrieval",

app/backend/approaches/retrievethenread.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
from approaches.approach import (
1010
Approach,
11-
DataPoints,
1211
ExtraInfo,
1312
ThoughtStep,
1413
)
@@ -183,12 +182,12 @@ async def run_search_approach(
183182
use_query_rewriting,
184183
)
185184

186-
text_sources, image_sources, citations = await self.get_sources_content(
185+
data_points = await self.get_sources_content(
187186
results, use_semantic_captions, download_image_sources=send_image_sources, user_oid=auth_claims.get("oid")
188187
)
189188

190189
return ExtraInfo(
191-
DataPoints(text=text_sources, images=image_sources, citations=citations),
190+
data_points,
192191
thoughts=[
193192
ThoughtStep(
194193
"Search using user query",
@@ -238,15 +237,15 @@ async def run_agentic_retrieval_approach(
238237
results_merge_strategy=results_merge_strategy,
239238
)
240239

241-
text_sources, image_sources, citations = await self.get_sources_content(
240+
data_points = await self.get_sources_content(
242241
results,
243242
use_semantic_captions=False,
244243
download_image_sources=send_image_sources,
245244
user_oid=auth_claims.get("oid"),
246245
)
247246

248247
extra_info = ExtraInfo(
249-
DataPoints(text=text_sources, images=image_sources, citations=citations),
248+
data_points,
250249
thoughts=[
251250
ThoughtStep(
252251
"Use agentic retrieval",

0 commit comments

Comments
 (0)