From d343af142243a723be83c5a20ba54b5fd7c56891 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 6 Oct 2025 18:35:10 -0700 Subject: [PATCH 1/2] Sanitize image markdown and add test --- app/backend/approaches/approach.py | 13 ++++---- tests/test_chatapproach.py | 51 ++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index 04a74a8818..9dd05b1057 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -372,8 +372,10 @@ async def get_sources_content( DataPoints: with text (list[str]), images (list[str - base64 data URI]), citations (list[str]). """ - def nonewlines(s: str) -> str: - return s.replace("\n", " ").replace("\r", " ") + def clean_source(s: str) -> str: + s = s.replace("\n", " ").replace("\r", " ") # normalize newlines to spaces + s = s.replace(":::", "&#58;&#58;&#58;") # escape DocFX/markdown triple colons + return s citations = [] text_sources = [] @@ -389,11 +391,10 @@ def nonewlines(s: str) -> str: # If semantic captions are used, extract captions; otherwise, use content if include_text_sources: if use_semantic_captions and doc.captions: - text_sources.append( - f"{citation}: {nonewlines(' . '.join([cast(str, c.text) for c in doc.captions]))}" - ) + cleaned = clean_source(" . 
".join([cast(str, c.text) for c in doc.captions])) else: - text_sources.append(f"{citation}: {nonewlines(doc.content or '')}") + cleaned = clean_source(doc.content or "") + text_sources.append(f"{citation}: {cleaned}") if download_image_sources and hasattr(doc, "images") and doc.images: for img in doc.images: diff --git a/tests/test_chatapproach.py b/tests/test_chatapproach.py index aa6145f273..21b5d38b18 100644 --- a/tests/test_chatapproach.py +++ b/tests/test_chatapproach.py @@ -6,6 +6,7 @@ from azure.search.documents.models import VectorizedQuery from openai.types.chat import ChatCompletion +from approaches.approach import Document from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach from approaches.promptmanager import PromptyManager from prepdocslib.embeddings import ImageEmbeddings @@ -255,3 +256,53 @@ async def test_compute_multimodal_embedding_no_client(): # Test that calling compute_multimodal_embedding raises a ValueError with pytest.raises(ValueError, match="Approach is missing an image embeddings client for multimodal queries"): await chat_approach.compute_multimodal_embedding("What's in this image?") + + +def test_chat_prompt_render_with_image_directive(chat_approach): + """Verify DocFX style :::image directive is sanitized (replaced with [image]) during prompt rendering.""" + image_directive = ( + "activator-introduction.md#page=1: Intro text before image. " + ':::image type="content" source="./media/activator-introduction/activator.png" ' + 'alt-text="Diagram that shows the architecture of Fabric Activator."::: More text after image.' 
+ ) + # Build Document and run get_sources_content to apply sanitization + import asyncio + + async def build_sources(): + return await chat_approach.get_sources_content( + [ + Document( + id="doc1", + content=image_directive.split(": ", 1)[1], + sourcepage="activator-introduction.md#page=1", + sourcefile="activator-introduction.md", + ) + ], + use_semantic_captions=False, + include_text_sources=True, + download_image_sources=False, + user_oid=None, + ) + + data_points = asyncio.get_event_loop().run_until_complete(build_sources()) + + messages = chat_approach.prompt_manager.render_prompt( + chat_approach.answer_prompt, + { + "include_follow_up_questions": False, + "past_messages": [], + "user_query": "What is Fabric Activator?", + "text_sources": data_points.text, + "image_sources": data_points.images, + "citations": data_points.citations, + }, + ) + assert messages + # Find the user message containing Sources and verify placeholder + combined = "\n".join([m["content"] for m in messages if m["role"] == "user"]) # type: ignore + # Expect triple colons escaped + assert "&#58;&#58;&#58;image" in combined + assert "activator-introduction/activator.png" in combined + assert "Diagram that shows the architecture of Fabric Activator." 
in combined + # Original unescaped sequence should be gone + assert ":::image" not in combined From 3edc3b23463a69c2cf9eed9256137ac534862d3c Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 6 Oct 2025 18:39:49 -0700 Subject: [PATCH 2/2] Address feedback --- tests/test_chatapproach.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_chatapproach.py b/tests/test_chatapproach.py index 21b5d38b18..fe131ab79a 100644 --- a/tests/test_chatapproach.py +++ b/tests/test_chatapproach.py @@ -258,15 +258,14 @@ async def test_compute_multimodal_embedding_no_client(): await chat_approach.compute_multimodal_embedding("What's in this image?") -def test_chat_prompt_render_with_image_directive(chat_approach): +@pytest.mark.asyncio +async def test_chat_prompt_render_with_image_directive(chat_approach): """Verify DocFX style :::image directive is sanitized (replaced with [image]) during prompt rendering.""" image_directive = ( "activator-introduction.md#page=1: Intro text before image. " ':::image type="content" source="./media/activator-introduction/activator.png" ' 'alt-text="Diagram that shows the architecture of Fabric Activator."::: More text after image.' 
) - # Build Document and run get_sources_content to apply sanitization - import asyncio async def build_sources(): return await chat_approach.get_sources_content( @@ -284,7 +283,7 @@ async def build_sources(): user_oid=None, ) - data_points = asyncio.get_event_loop().run_until_complete(build_sources()) + data_points = await build_sources() messages = chat_approach.prompt_manager.render_prompt( chat_approach.answer_prompt, @@ -299,7 +298,7 @@ async def build_sources(): ) assert messages # Find the user message containing Sources and verify placeholder - combined = "\n".join([m["content"] for m in messages if m["role"] == "user"]) # type: ignore + combined = "\n".join([m["content"] for m in messages if m["role"] == "user"]) # Expect triple colons escaped assert "&#58;&#58;&#58;image" in combined assert "activator-introduction/activator.png" in combined