diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py
index 04a74a8818..9dd05b1057 100644
--- a/app/backend/approaches/approach.py
+++ b/app/backend/approaches/approach.py
@@ -372,8 +372,10 @@ async def get_sources_content(
             DataPoints: with text (list[str]), images (list[str - base64 data URI]), citations (list[str]).
         """
 
-        def nonewlines(s: str) -> str:
-            return s.replace("\n", " ").replace("\r", " ")
+        def clean_source(s: str) -> str:
+            s = s.replace("\n", " ").replace("\r", " ")  # normalize newlines to spaces
+            s = s.replace(":::", "\\:\\:\\:")  # escape DocFX/markdown triple colons
+            return s
 
         citations = []
         text_sources = []
@@ -389,11 +391,10 @@ def nonewlines(s: str) -> str:
             # If semantic captions are used, extract captions; otherwise, use content
             if include_text_sources:
                 if use_semantic_captions and doc.captions:
-                    text_sources.append(
-                        f"{citation}: {nonewlines(' . '.join([cast(str, c.text) for c in doc.captions]))}"
-                    )
+                    cleaned = clean_source(" . ".join([cast(str, c.text) for c in doc.captions]))
                 else:
-                    text_sources.append(f"{citation}: {nonewlines(doc.content or '')}")
+                    cleaned = clean_source(doc.content or "")
+                text_sources.append(f"{citation}: {cleaned}")
 
             if download_image_sources and hasattr(doc, "images") and doc.images:
                 for img in doc.images:
diff --git a/tests/test_chatapproach.py b/tests/test_chatapproach.py
index aa6145f273..fe131ab79a 100644
--- a/tests/test_chatapproach.py
+++ b/tests/test_chatapproach.py
@@ -6,6 +6,7 @@
 from azure.search.documents.models import VectorizedQuery
 from openai.types.chat import ChatCompletion
 
+from approaches.approach import Document
 from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach
 from approaches.promptmanager import PromptyManager
 from prepdocslib.embeddings import ImageEmbeddings
@@ -255,3 +256,52 @@ async def test_compute_multimodal_embedding_no_client():
     # Test that calling compute_multimodal_embedding raises a ValueError
     with pytest.raises(ValueError, match="Approach is missing an image embeddings client for multimodal queries"):
         await chat_approach.compute_multimodal_embedding("What's in this image?")
+
+
+@pytest.mark.asyncio
+async def test_chat_prompt_render_with_image_directive(chat_approach):
+    """Verify that a DocFX-style :::image directive in source content is escaped during prompt rendering."""
+    image_directive = (
+        "activator-introduction.md#page=1: Intro text before image. "
+        ':::image type="content" source="./media/activator-introduction/activator.png" '
+        'alt-text="Diagram that shows the architecture of Fabric Activator."::: More text after image.'
+    )
+
+    async def build_sources():
+        return await chat_approach.get_sources_content(
+            [
+                Document(
+                    id="doc1",
+                    content=image_directive.split(": ", 1)[1],
+                    sourcepage="activator-introduction.md#page=1",
+                    sourcefile="activator-introduction.md",
+                )
+            ],
+            use_semantic_captions=False,
+            include_text_sources=True,
+            download_image_sources=False,
+            user_oid=None,
+        )
+
+    data_points = await build_sources()
+
+    messages = chat_approach.prompt_manager.render_prompt(
+        chat_approach.answer_prompt,
+        {
+            "include_follow_up_questions": False,
+            "past_messages": [],
+            "user_query": "What is Fabric Activator?",
+            "text_sources": data_points.text,
+            "image_sources": data_points.images,
+            "citations": data_points.citations,
+        },
+    )
+    assert messages
+    # Find the user message containing the sources and verify the directive is escaped
+    combined = "\n".join([m["content"] for m in messages if m["role"] == "user"])
+    # Expect triple colons escaped
+    assert "\\:\\:\\:image" in combined
+    assert "activator-introduction/activator.png" in combined
+    assert "Diagram that shows the architecture of Fabric Activator." in combined
+    # Original unescaped sequence should be gone
+    assert ":::image" not in combined
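
Illustration of the escaping behavior (a minimal standalone sketch, assuming clean_source inserts a backslash before each colon of a ":::" run as in the diff above; the sample directive string is hypothetical):

    def clean_source(s: str) -> str:
        # Normalize newlines so each source renders on a single line.
        s = s.replace("\n", " ").replace("\r", " ")
        # Escape triple colons so the prompt renderer does not interpret
        # DocFX directives such as :::image inside the source text.
        return s.replace(":::", "\\:\\:\\:")

    directive = ':::image type="content" source="./media/activator.png" alt-text="Diagram":::'
    escaped = clean_source(directive)
    # escaped == '\:\:\:image type="content" source="./media/activator.png" alt-text="Diagram"\:\:\:'
    assert ":::" not in escaped and "activator.png" in escaped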