Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions app/backend/approaches/approach.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,8 +372,10 @@ async def get_sources_content(
DataPoints: with text (list[str]), images (list[str - base64 data URI]), citations (list[str]).
"""

def nonewlines(s: str) -> str:
return s.replace("\n", " ").replace("\r", " ")
def clean_source(s: str) -> str:
s = s.replace("\n", " ").replace("\r", " ") # normalize newlines to spaces
s = s.replace(":::", ":::") # escape DocFX/markdown triple colons
return s
Comment on lines +376 to +378
Copy link
Preview

Copilot AI Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The function modifies and reassigns the same variable 's' multiple times. Consider using a more functional approach by chaining the operations or using intermediate variables for better readability.

Suggested change
s = s.replace("\n", " ").replace("\r", " ")  # normalize newlines to spaces
s = s.replace(":::", "\\:\\:\\:")  # escape DocFX/markdown triple colons
return s
normalized = s.replace("\n", " ").replace("\r", " ")  # normalize newlines to spaces
escaped = normalized.replace(":::", "\\:\\:\\:")  # escape DocFX/markdown triple colons
return escaped

Copilot uses AI. Check for mistakes.


citations = []
text_sources = []
Expand All @@ -389,11 +391,10 @@ def nonewlines(s: str) -> str:
# If semantic captions are used, extract captions; otherwise, use content
if include_text_sources:
if use_semantic_captions and doc.captions:
text_sources.append(
f"{citation}: {nonewlines(' . '.join([cast(str, c.text) for c in doc.captions]))}"
)
cleaned = clean_source(" . ".join([cast(str, c.text) for c in doc.captions]))
else:
text_sources.append(f"{citation}: {nonewlines(doc.content or '')}")
cleaned = clean_source(doc.content or "")
text_sources.append(f"{citation}: {cleaned}")

if download_image_sources and hasattr(doc, "images") and doc.images:
for img in doc.images:
Expand Down
51 changes: 51 additions & 0 deletions tests/test_chatapproach.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from azure.search.documents.models import VectorizedQuery
from openai.types.chat import ChatCompletion

from approaches.approach import Document
from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach
from approaches.promptmanager import PromptyManager
from prepdocslib.embeddings import ImageEmbeddings
Expand Down Expand Up @@ -255,3 +256,53 @@ async def test_compute_multimodal_embedding_no_client():
# Test that calling compute_multimodal_embedding raises a ValueError
with pytest.raises(ValueError, match="Approach is missing an image embeddings client for multimodal queries"):
await chat_approach.compute_multimodal_embedding("What's in this image?")


def test_chat_prompt_render_with_image_directive(chat_approach):
    """Verify a DocFX-style :::image directive is sanitized (triple colons escaped) during prompt rendering."""
    image_directive = (
        "activator-introduction.md#page=1: Intro text before image. "
        ':::image type="content" source="./media/activator-introduction/activator.png" '
        'alt-text="Diagram that shows the architecture of Fabric Activator."::: More text after image.'
    )
    # Build Document and run get_sources_content to apply sanitization
    import asyncio

    async def build_sources():
        return await chat_approach.get_sources_content(
            [
                Document(
                    id="doc1",
                    content=image_directive.split(": ", 1)[1],
                    sourcepage="activator-introduction.md#page=1",
                    sourcefile="activator-introduction.md",
                )
            ],
            use_semantic_captions=False,
            include_text_sources=True,
            download_image_sources=False,
            user_oid=None,
        )

    # asyncio.run creates and tears down its own loop; get_event_loop() in a
    # non-async context is deprecated since Python 3.10.
    data_points = asyncio.run(build_sources())

    messages = chat_approach.prompt_manager.render_prompt(
        chat_approach.answer_prompt,
        {
            "include_follow_up_questions": False,
            "past_messages": [],
            "user_query": "What is Fabric Activator?",
            "text_sources": data_points.text,
            "image_sources": data_points.images,
            "citations": data_points.citations,
        },
    )
    assert messages
    # Find the user message containing Sources and verify the escaped directive
    combined = "\n".join([m["content"] for m in messages if m["role"] == "user"])  # type: ignore
    # Expect triple colons escaped (backslash-escaped by clean_source)
    assert "\\:\\:\\:image" in combined
    assert "activator-introduction/activator.png" in combined
    assert "Diagram that shows the architecture of Fabric Activator." in combined
    # Original unescaped sequence should be gone
    assert ":::image" not in combined