Skip to content

Commit efe3c77

Browse files
authored
Sanitize image markdown in sources (#2765)
* Sanitize image markdown and add test * Address feedback
1 parent dafcef7 commit efe3c77

File tree

2 files changed

+57
-6
lines changed

2 files changed

+57
-6
lines changed

app/backend/approaches/approach.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -372,8 +372,10 @@ async def get_sources_content(
372372
DataPoints: with text (list[str]), images (list[str - base64 data URI]), citations (list[str]).
373373
"""
374374

375-
def nonewlines(s: str) -> str:
376-
return s.replace("\n", " ").replace("\r", " ")
375+
def clean_source(s: str) -> str:
376+
s = s.replace("\n", " ").replace("\r", " ") # normalize newlines to spaces
377+
s = s.replace(":::", "&#58;&#58;&#58;") # escape DocFX/markdown triple colons
378+
return s
377379

378380
citations = []
379381
text_sources = []
@@ -389,11 +391,10 @@ def nonewlines(s: str) -> str:
389391
# If semantic captions are used, extract captions; otherwise, use content
390392
if include_text_sources:
391393
if use_semantic_captions and doc.captions:
392-
text_sources.append(
393-
f"{citation}: {nonewlines(' . '.join([cast(str, c.text) for c in doc.captions]))}"
394-
)
394+
cleaned = clean_source(" . ".join([cast(str, c.text) for c in doc.captions]))
395395
else:
396-
text_sources.append(f"{citation}: {nonewlines(doc.content or '')}")
396+
cleaned = clean_source(doc.content or "")
397+
text_sources.append(f"{citation}: {cleaned}")
397398

398399
if download_image_sources and hasattr(doc, "images") and doc.images:
399400
for img in doc.images:

tests/test_chatapproach.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from azure.search.documents.models import VectorizedQuery
77
from openai.types.chat import ChatCompletion
88

9+
from approaches.approach import Document
910
from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach
1011
from approaches.promptmanager import PromptyManager
1112
from prepdocslib.embeddings import ImageEmbeddings
@@ -255,3 +256,52 @@ async def test_compute_multimodal_embedding_no_client():
255256
# Test that calling compute_multimodal_embedding raises a ValueError
256257
with pytest.raises(ValueError, match="Approach is missing an image embeddings client for multimodal queries"):
257258
await chat_approach.compute_multimodal_embedding("What's in this image?")
259+
260+
261+
@pytest.mark.asyncio
262+
async def test_chat_prompt_render_with_image_directive(chat_approach):
263+
"""Verify DocFX style :::image directive is sanitized (replaced with [image]) during prompt rendering."""
264+
image_directive = (
265+
"activator-introduction.md#page=1: Intro text before image. "
266+
':::image type="content" source="./media/activator-introduction/activator.png" '
267+
'alt-text="Diagram that shows the architecture of Fabric Activator."::: More text after image.'
268+
)
269+
270+
async def build_sources():
271+
return await chat_approach.get_sources_content(
272+
[
273+
Document(
274+
id="doc1",
275+
content=image_directive.split(": ", 1)[1],
276+
sourcepage="activator-introduction.md#page=1",
277+
sourcefile="activator-introduction.md",
278+
)
279+
],
280+
use_semantic_captions=False,
281+
include_text_sources=True,
282+
download_image_sources=False,
283+
user_oid=None,
284+
)
285+
286+
data_points = await build_sources()
287+
288+
messages = chat_approach.prompt_manager.render_prompt(
289+
chat_approach.answer_prompt,
290+
{
291+
"include_follow_up_questions": False,
292+
"past_messages": [],
293+
"user_query": "What is Fabric Activator?",
294+
"text_sources": data_points.text,
295+
"image_sources": data_points.images,
296+
"citations": data_points.citations,
297+
},
298+
)
299+
assert messages
300+
# Find the user message containing Sources and verify placeholder
301+
combined = "\n".join([m["content"] for m in messages if m["role"] == "user"])
302+
# Expect triple colons escaped
303+
assert "&#58;&#58;&#58;image" in combined
304+
assert "activator-introduction/activator.png" in combined
305+
assert "Diagram that shows the architecture of Fabric Activator." in combined
306+
# Original unescaped sequence should be gone
307+
assert ":::image" not in combined

0 commit comments

Comments
 (0)