16 changes: 9 additions & 7 deletions app/backend/approaches/approach.py
@@ -412,6 +412,7 @@ async def get_sources_content(
self,
results: list[Document],
use_semantic_captions: bool,
include_text_sources: bool,
download_image_sources: bool,
user_oid: Optional[str] = None,
) -> DataPoints:
@@ -442,10 +443,13 @@ def nonewlines(s: str) -> str:
citations.append(citation)

# If semantic captions are used, extract captions; otherwise, use content
if use_semantic_captions and doc.captions:
text_sources.append(f"{citation}: {nonewlines(' . '.join([cast(str, c.text) for c in doc.captions]))}")
else:
text_sources.append(f"{citation}: {nonewlines(doc.content or '')}")
if include_text_sources:
Collaborator Author: The only change here is the very top line.

if use_semantic_captions and doc.captions:
text_sources.append(
f"{citation}: {nonewlines(' . '.join([cast(str, c.text) for c in doc.captions]))}"
)
else:
text_sources.append(f"{citation}: {nonewlines(doc.content or '')}")

if download_image_sources and hasattr(doc, "images") and doc.images:
for img in doc.images:
@@ -457,9 +461,7 @@ def nonewlines(s: str) -> str:
if url:
image_sources.append(url)
citations.append(self.get_image_citation(doc.sourcepage or "", img["url"]))
Copilot AI (Sep 8, 2025):
The return statement always includes image_sources regardless of the download_image_sources parameter. When download_image_sources is False, image_sources should be empty or None to avoid returning potentially unintended image data.

Suggested change
citations.append(self.get_image_citation(doc.sourcepage or "", img["url"]))
citations.append(self.get_image_citation(doc.sourcepage or "", img["url"]))
if not download_image_sources:
image_sources = None


Collaborator Author:
But it will be an empty list in that case? What's wrong with that?

if download_image_sources:
return DataPoints(text=text_sources, images=image_sources, citations=citations)
return DataPoints(text=text_sources, citations=citations)
return DataPoints(text=text_sources, images=image_sources, citations=citations)
Collaborator Author:
This makes image_sources [] when not including image sources, which is consistent with text_sources being [] when not including text sources. Snapshots already aligned with this, and it simplified code.

Copilot AI (Sep 8, 2025):
The return statement always includes image_sources even when download_image_sources is False. This should conditionally include images only when download_image_sources is True, otherwise pass None or an empty list for the images parameter.

Suggested change
return DataPoints(text=text_sources, images=image_sources, citations=citations)
return DataPoints(
text=text_sources,
images=image_sources if download_image_sources else None,
citations=citations
)


Collaborator Author:
@copilot But it will be an empty list in that case, since image_sources is only populated when download_image_sources is True?
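To make this thread easier to follow, here is a minimal sketch of the flag behavior; DataPoints and the document dicts below are simplified stand-ins for the repo's actual types, and build_data_points is a hypothetical condensation of get_sources_content. Since image_sources is only ever appended to when download_image_sources is True, the single unconditional return yields an empty list rather than unintended image data.

from dataclasses import dataclass, field


@dataclass
class DataPoints:  # simplified stand-in for the real dataclass
    text: list[str] = field(default_factory=list)
    images: list[str] = field(default_factory=list)
    citations: list[str] = field(default_factory=list)


def build_data_points(docs: list[dict], include_text_sources: bool, download_image_sources: bool) -> DataPoints:
    text_sources: list[str] = []
    image_sources: list[str] = []
    citations: list[str] = []
    for doc in docs:
        citations.append(doc["citation"])
        if include_text_sources:
            text_sources.append(f"{doc['citation']}: {doc['content']}")
        if download_image_sources and doc.get("images"):
            image_sources.extend(doc["images"])
    # Both lists stay [] when their flag is off, so a single return covers every case
    return DataPoints(text=text_sources, images=image_sources, citations=citations)


docs = [{"citation": "a.pdf#page=1", "content": "Paris is the capital.", "images": ["https://example.com/a.png"]}]
print(build_data_points(docs, include_text_sources=True, download_image_sources=False))
# -> DataPoints(text=['a.pdf#page=1: Paris is the capital.'], images=[], citations=['a.pdf#page=1'])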


def get_citation(self, sourcepage: Optional[str]):
return sourcepage or ""
22 changes: 11 additions & 11 deletions app/backend/approaches/chatreadretrieveread.py
@@ -16,7 +16,6 @@

from approaches.approach import (
Approach,
DataPoints,
ExtraInfo,
ThoughtStep,
)
@@ -284,9 +283,11 @@ async def run_search_approach(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
search_index_filter = self.build_filter(overrides, auth_claims)
send_text_sources = overrides.get("send_text_sources", True)
send_image_sources = overrides.get("send_image_sources", True)
send_image_sources = overrides.get("send_image_sources", self.multimodal_enabled) and self.multimodal_enabled
Collaborator:
great :)
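As a small illustration of the gating pattern here (resolve_image_flag is a hypothetical helper, not code from this PR): the deployment-level multimodal setting supplies the default and also caps any client override, so a client-sent True cannot enable image sources when multimodal support is off.

def resolve_image_flag(overrides: dict, multimodal_enabled: bool) -> bool:
    # The deployment setting is the default, and a client-supplied True is still gated by it
    return overrides.get("send_image_sources", multimodal_enabled) and multimodal_enabled


assert resolve_image_flag({}, multimodal_enabled=True) is True                                # default: on
assert resolve_image_flag({}, multimodal_enabled=False) is False                              # default: off
assert resolve_image_flag({"send_image_sources": True}, multimodal_enabled=False) is False    # override ignored
assert resolve_image_flag({"send_image_sources": False}, multimodal_enabled=True) is False    # opt-out respected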

search_text_embeddings = overrides.get("search_text_embeddings", True)
search_image_embeddings = overrides.get("search_image_embeddings", self.multimodal_enabled)
search_image_embeddings = (
overrides.get("search_image_embeddings", self.multimodal_enabled) and self.multimodal_enabled
)

original_user_query = messages[-1]["content"]
if not isinstance(original_user_query, str):
@@ -342,11 +343,12 @@

# STEP 3: Generate a contextual and content specific answer using the search results and chat history
data_points = await self.get_sources_content(
results, use_semantic_captions, download_image_sources=send_image_sources, user_oid=auth_claims.get("oid")
results,
use_semantic_captions,
include_text_sources=send_text_sources,
download_image_sources=send_image_sources,
user_oid=auth_claims.get("oid"),
)
if not send_text_sources:
data_points = DataPoints(text=[], images=data_points.images, citations=data_points.citations)

extra_info = ExtraInfo(
data_points,
thoughts=[
@@ -396,7 +398,7 @@ async def run_agentic_retrieval_approach(
# 50 is the amount of documents that the reranker can process per query
max_docs_for_reranker = max_subqueries * 50
send_text_sources = overrides.get("send_text_sources", True)
send_image_sources = overrides.get("send_image_sources", True)
send_image_sources = overrides.get("send_image_sources", self.multimodal_enabled) and self.multimodal_enabled

response, results = await self.run_agentic_retrieval(
messages=messages,
@@ -412,12 +414,10 @@
data_points = await self.get_sources_content(
results,
use_semantic_captions=False,
include_text_sources=send_text_sources,
download_image_sources=send_image_sources,
user_oid=auth_claims.get("oid"),
)
if not send_text_sources:
data_points = DataPoints(text=[], images=data_points.images, citations=data_points.citations)

extra_info = ExtraInfo(
data_points,
thoughts=[
17 changes: 13 additions & 4 deletions app/backend/approaches/retrievethenread.py
@@ -159,9 +159,12 @@ async def run_search_approach(
minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0)
filter = self.build_filter(overrides, auth_claims)
q = str(messages[-1]["content"])
send_image_sources = overrides.get("send_image_sources", True)
send_text_sources = overrides.get("send_text_sources", True)
send_image_sources = overrides.get("send_image_sources", self.multimodal_enabled) and self.multimodal_enabled
search_text_embeddings = overrides.get("search_text_embeddings", True)
search_image_embeddings = overrides.get("search_image_embeddings", self.multimodal_enabled)
search_image_embeddings = (
overrides.get("search_image_embeddings", self.multimodal_enabled) and self.multimodal_enabled
)

vectors: list[VectorQuery] = []
if use_vector_search:
@@ -185,7 +188,11 @@
)

data_points = await self.get_sources_content(
results, use_semantic_captions, download_image_sources=send_image_sources, user_oid=auth_claims.get("oid")
results,
use_semantic_captions,
include_text_sources=send_text_sources,
download_image_sources=send_image_sources,
user_oid=auth_claims.get("oid"),
)

return ExtraInfo(
@@ -226,7 +233,8 @@ async def run_agentic_retrieval_approach(
results_merge_strategy = overrides.get("results_merge_strategy", "interleaved")
# 50 is the amount of documents that the reranker can process per query
max_docs_for_reranker = max_subqueries * 50
send_image_sources = overrides.get("send_image_sources", True)
send_text_sources = overrides.get("send_text_sources", True)
send_image_sources = overrides.get("send_image_sources", self.multimodal_enabled) and self.multimodal_enabled

response, results = await self.run_agentic_retrieval(
messages,
@@ -242,6 +250,7 @@
data_points = await self.get_sources_content(
results,
use_semantic_captions=False,
include_text_sources=send_text_sources,
download_image_sources=send_image_sources,
user_oid=auth_claims.get("oid"),
)
15 changes: 7 additions & 8 deletions app/frontend/src/pages/ask/Ask.tsx
@@ -37,13 +37,13 @@ export function Component(): JSX.Element {
const [useQueryRewriting, setUseQueryRewriting] = useState<boolean>(false);
const [reasoningEffort, setReasoningEffort] = useState<string>("");
const [sendTextSources, setSendTextSources] = useState<boolean>(true);
const [sendImageSources, setSendImageSources] = useState<boolean>(true);
const [sendImageSources, setSendImageSources] = useState<boolean>(false);
const [includeCategory, setIncludeCategory] = useState<string>("");

const [excludeCategory, setExcludeCategory] = useState<string>("");
const [question, setQuestion] = useState<string>("");
const [searchTextEmbeddings, setSearchTextEmbeddings] = useState<boolean>(true);
const [searchImageEmbeddings, setSearchImageEmbeddings] = useState<boolean>(true);
const [searchImageEmbeddings, setSearchImageEmbeddings] = useState<boolean>(false);
const [useOidSecurityFilter, setUseOidSecurityFilter] = useState<boolean>(false);
const [useGroupsSecurityFilter, setUseGroupsSecurityFilter] = useState<boolean>(false);
const [showMultimodalOptions, setShowMultimodalOptions] = useState<boolean>(false);
@@ -87,12 +87,11 @@ export function Component(): JSX.Element {
configApi().then(config => {
setShowMultimodalOptions(config.showMultimodalOptions);
if (config.showMultimodalOptions) {
// Set default LLM inputs based on config override or fallback to Texts
setSendTextSources(true);
setSendImageSources(true);
// Set default vector field settings
setSearchTextEmbeddings(true);
setSearchImageEmbeddings(true);
// Initialize from server config so defaults follow deployment settings
Collaborator Author:
This is how /chat was already doing it; fixed here for consistency.

setSendTextSources(config.ragSendTextSources !== undefined ? config.ragSendTextSources : true);
setSendImageSources(config.ragSendImageSources);
setSearchTextEmbeddings(config.ragSearchTextEmbeddings);
setSearchImageEmbeddings(config.ragSearchImageEmbeddings);
}
setUseSemanticRanker(config.showSemanticRankerOption);
setShowSemanticRankerOption(config.showSemanticRankerOption);
6 changes: 3 additions & 3 deletions app/frontend/src/pages/chat/Chat.tsx
@@ -48,11 +48,11 @@ const Chat = () => {
const [excludeCategory, setExcludeCategory] = useState<string>("");
const [useSuggestFollowupQuestions, setUseSuggestFollowupQuestions] = useState<boolean>(false);
const [searchTextEmbeddings, setSearchTextEmbeddings] = useState<boolean>(true);
const [searchImageEmbeddings, setSearchImageEmbeddings] = useState<boolean>(true);
const [searchImageEmbeddings, setSearchImageEmbeddings] = useState<boolean>(false);
const [useOidSecurityFilter, setUseOidSecurityFilter] = useState<boolean>(false);
const [useGroupsSecurityFilter, setUseGroupsSecurityFilter] = useState<boolean>(false);
const [sendTextSources, setSendTextSources] = useState<boolean>(true);
const [sendImageSources, setSendImageSources] = useState<boolean>(true);
const [sendImageSources, setSendImageSources] = useState<boolean>(false);

const lastQuestionRef = useRef<string>("");
const chatMessageStreamEnd = useRef<HTMLDivElement | null>(null);
@@ -99,7 +99,7 @@ const Chat = () => {
configApi().then(config => {
setShowMultimodalOptions(config.showMultimodalOptions);
if (config.showMultimodalOptions) {
// Always have at least one source enabled, default to text if none specified
// Initialize from server config so defaults match deployment settings
setSendTextSources(config.ragSendTextSources !== undefined ? config.ragSendTextSources : true);
setSendImageSources(config.ragSendImageSources);
setSearchTextEmbeddings(config.ragSearchTextEmbeddings);
17 changes: 12 additions & 5 deletions tests/e2e.py
@@ -93,14 +93,21 @@ def test_chat(sized_page: Page, live_server_url: str):

# Set up a mock route to the /chat endpoint with streaming results
def handle(route: Route):
# Assert that session_state is specified in the request (None for now)
try:
post_data = route.request.post_data_json
if post_data and "session_state" in post_data:
session_state = post_data["session_state"]
assert session_state is None
# Assert that session_state is specified (None initially)
if "session_state" in post_data:
assert post_data["session_state"] is None
overrides = post_data["context"]["overrides"]
# Assert that the default overrides are correct
assert overrides.get("send_text_sources") is True
assert overrides.get("send_image_sources") is False
assert overrides.get("search_text_embeddings") is True
assert overrides.get("search_image_embeddings") is False
# retrieval_mode may be explicitly "hybrid" or omitted (interpreted as hybrid)
assert overrides.get("retrieval_mode") in ["hybrid", None]
except Exception as e:
print(f"Error in test_chat handler: {e}")
print(f"Error in test_chat handler (defaults validation): {e}")

# Read the JSONL from our snapshot results and return as the response
f = open("tests/snapshots/test_app/test_chat_stream_text/client0/result.jsonlines")
38 changes: 38 additions & 0 deletions tests/test_app.py
@@ -296,6 +296,44 @@ async def test_ask_rtr_text_semanticcaptions(client, snapshot):
snapshot.assert_match(json.dumps(result, indent=4), "result.json")


@pytest.mark.asyncio
@pytest.mark.parametrize("route", ["/ask", "/chat"])
async def test_send_text_sources_false(client, route):
"""When send_text_sources is False, text sources should be omitted while citations remain."""
response = await client.post(
route,
json={
"messages": [{"content": "What is the capital of France?", "role": "user"}],
"context": {"overrides": {"retrieval_mode": "text", "send_text_sources": False}},
},
)
assert response.status_code == 200
result = await response.get_json()
data_points = result["context"]["data_points"]
assert data_points["text"] == []
assert "citations" in data_points and len(data_points["citations"]) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("route", ["/ask", "/chat"])
async def test_search_image_embeddings_ignored_without_multimodal(client, route):
"""Sending search_image_embeddings=True when USE_MULTIMODAL is false should be ignored and still succeed (200)."""
response = await client.post(
route,
json={
"messages": [{"content": "What is the capital of France?", "role": "user"}],
"context": {"overrides": {"search_image_embeddings": True, "send_image_sources": True}},
},
)
assert response.status_code == 200
result = await response.get_json()
# Ensure the thought step recorded search_image_embeddings as False
search_thought = [
thought for thought in result["context"]["thoughts"] if thought["title"].startswith("Search using")
][0]
assert search_thought["props"]["search_image_embeddings"] is False


@pytest.mark.asyncio
async def test_ask_rtr_hybrid(client, snapshot):
response = await client.post(