Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions app/backend/approaches/approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,8 +372,10 @@ async def get_sources_content(
DataPoints: with text (list[str]), images (list[str - base64 data URI]), citations (list[str]).
"""

def nonewlines(s: str) -> str:
return s.replace("\n", " ").replace("\r", " ")
def clean_source(s: str) -> str:
s = s.replace("\n", " ").replace("\r", " ") # normalize newlines to spaces
s = s.replace(":::", ":::") # escape DocFX/markdown triple colons
return s
Comment on lines +376 to +378
Copy link
Preview

Copilot AI Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The function modifies and reassigns the same variable 's' multiple times. Consider using a more functional approach by chaining the operations or using intermediate variables for better readability.

Suggested change
s = s.replace("\n", " ").replace("\r", " ")  # normalize newlines to spaces
s = s.replace(":::", "\\:\\:\\:")  # escape DocFX/markdown triple colons
return s
normalized = s.replace("\n", " ").replace("\r", " ")  # normalize newlines to spaces
escaped = normalized.replace(":::", "\\:\\:\\:")  # escape DocFX/markdown triple colons
return escaped

Copilot uses AI. Check for mistakes.


citations = []
text_sources = []
Expand All @@ -389,11 +391,10 @@ def nonewlines(s: str) -> str:
# If semantic captions are used, extract captions; otherwise, use content
if include_text_sources:
if use_semantic_captions and doc.captions:
text_sources.append(
f"{citation}: {nonewlines(' . '.join([cast(str, c.text) for c in doc.captions]))}"
)
cleaned = clean_source(" . ".join([cast(str, c.text) for c in doc.captions]))
else:
text_sources.append(f"{citation}: {nonewlines(doc.content or '')}")
cleaned = clean_source(doc.content or "")
text_sources.append(f"{citation}: {cleaned}")

if download_image_sources and hasattr(doc, "images") and doc.images:
for img in doc.images:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_chatapproach.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from azure.search.documents.models import VectorizedQuery
from openai.types.chat import ChatCompletion

from approaches.approach import Document
from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach
from approaches.promptmanager import PromptyManager
from prepdocslib.embeddings import ImageEmbeddings
Expand Down Expand Up @@ -255,3 +256,53 @@ async def test_compute_multimodal_embedding_no_client():
# Test that calling compute_multimodal_embedding raises a ValueError
with pytest.raises(ValueError, match="Approach is missing an image embeddings client for multimodal queries"):
await chat_approach.compute_multimodal_embedding("What's in this image?")


def test_chat_prompt_render_with_image_directive(chat_approach):
    """Verify a DocFX-style :::image directive is sanitized (triple colons escaped) during prompt rendering."""
    image_directive = (
        "activator-introduction.md#page=1: Intro text before image. "
        ':::image type="content" source="./media/activator-introduction/activator.png" '
        'alt-text="Diagram that shows the architecture of Fabric Activator."::: More text after image.'
    )
    # Build Document and run get_sources_content to apply sanitization
    import asyncio

    async def build_sources():
        return await chat_approach.get_sources_content(
            [
                Document(
                    id="doc1",
                    content=image_directive.split(": ", 1)[1],
                    sourcepage="activator-introduction.md#page=1",
                    sourcefile="activator-introduction.md",
                )
            ],
            use_semantic_captions=False,
            include_text_sources=True,
            download_image_sources=False,
            user_oid=None,
        )

    # asyncio.run creates and tears down its own loop; get_event_loop() in a
    # non-async context is deprecated since Python 3.10.
    data_points = asyncio.run(build_sources())

    messages = chat_approach.prompt_manager.render_prompt(
        chat_approach.answer_prompt,
        {
            "include_follow_up_questions": False,
            "past_messages": [],
            "user_query": "What is Fabric Activator?",
            "text_sources": data_points.text,
            "image_sources": data_points.images,
            "citations": data_points.citations,
        },
    )
    assert messages
    # Find the user message containing Sources and verify the escaped directive
    combined = "\n".join([m["content"] for m in messages if m["role"] == "user"])  # type: ignore
    # Expect triple colons escaped (backslash-escaped by clean_source)
    assert "\\:\\:\\:image" in combined
    assert "activator-introduction/activator.png" in combined
    assert "Diagram that shows the architecture of Fabric Activator." in combined
    # Original unescaped sequence should be gone
    assert ":::image" not in combined