Add inline markdown processing for bold, italic, and code to HTML text formatting

btfranklin · btfranklin · commit 4a1b16b9824d · 2025-12-16T16:17:07.000-07:00
diff --git a/src/compendiumscribe/compendium/text_utils.py b/src/compendiumscribe/compendium/text_utils.py
@@ -68,8 +68,69 @@ def format_plain_text(text: str) -> str:
     return "".join(segments)
 
 
+def process_italic(text: str) -> str:
+    """Escape text for HTML and wrap *text* or _text_ in <em> tags."""
+    # Split by *
+    parts_star = text.split("*")
+    processed_parts: list[str] = []
+    
+    for i, part in enumerate(parts_star):
+        if i % 2 == 1:
+            # Odd segments inside *
+            processed_parts.append(f"<em>{html.escape(part)}</em>")
+        else:
+            # Even segments outside *, handle _ next
+            sub_parts = part.split("_")
+            for j, sub in enumerate(sub_parts):
+                if j % 2 == 1:
+                    sub_parts[j] = f"<em>{html.escape(sub)}</em>"
+                else:
+                    sub_parts[j] = html.escape(sub)
+            processed_parts.append("".join(sub_parts))
+            
+    return "".join(processed_parts)
+
+
+def process_bold(text: str) -> str:
+    """Wrap **text** or __text__ in <strong> tags."""
+    # Split by ** first
+    parts = text.split("**")
+    processed_parts: list[str] = []
+    
+    for i, part in enumerate(parts):
+        if i % 2 == 1:
+            # Odd segments inside **
+            # Recurse for italics inside bold
+            processed_parts.append(f"<strong>{process_italic(part)}</strong>")
+        else:
+            # Even segments outside **, handle __ next
+            # Note: odd-indexed pieces of the __ split are the bold spans
+            sub_parts = part.split("__")
+            for j, sub in enumerate(sub_parts):
+                if j % 2 == 1:
+                    sub_parts[j] = f"<strong>{process_italic(sub)}</strong>"
+                else:
+                    sub_parts[j] = process_italic(sub)
+            processed_parts.append("".join(sub_parts))
+    
+    return "".join(processed_parts)
+
+
+def process_inline_markdown(text: str) -> str:
+    """Escape text for HTML and wrap inline markdown (code, bold, italic)."""
+    parts = text.split("`")
+    for i, part in enumerate(parts):
+        if i % 2 == 1:
+            # Odd segments are inside backticks: escaping only
+            parts[i] = f"<code>{html.escape(part)}</code>"
+        else:
+            # Even segments are outside backticks: process bold/italic
+            parts[i] = process_bold(part)
+    return "".join(parts)
+
+
 def format_html_text(text: str | None) -> str:
-    """Render Markdown-style links as HTML anchors while escaping content."""
+    """Render Markdown-style links and inline formatting as HTML."""
 
     if text is None:
         return ""
@@ -79,21 +140,26 @@ def format_html_text(text: str | None) -> str:
     parts: list[str] = []
     cursor = 0
     for start, end, label, url in iter_markdown_links(text):
-        parts.append(html.escape(text[cursor:start]))
+        # Process text before the link
+        parts.append(process_inline_markdown(text[cursor:start]))
+        
         clean_url = url.strip()
-        escaped_label = html.escape(label)
+        # Process markdown inside the link label
+        processed_label = process_inline_markdown(label)
+        
         if clean_url:
             escaped_url = html.escape(clean_url, quote=True)
             anchor = (
                 f"<a href=\"{escaped_url}\" "
-                f"rel=\"noopener noreferrer\">{escaped_label}</a>"
+                f"rel=\"noopener noreferrer\">{processed_label}</a>"
             )
             parts.append(anchor)
         else:
-            parts.append(escaped_label)
+            parts.append(processed_label)
         cursor = end
 
-    parts.append(html.escape(text[cursor:]))
+    # Process remaining text
+    parts.append(process_inline_markdown(text[cursor:]))
     return "".join(parts)
 
 
diff --git a/tests/compendium/test_text_utils.py b/tests/compendium/test_text_utils.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+from compendiumscribe.compendium.text_utils import format_html_text
+
+
+def test_format_html_text_with_plain_text():
+    text = "Hello world"
+    assert format_html_text(text) == "Hello world"
+
+
+def test_format_html_text_escapes_special_chars():
+    text = "User <user>"
+    assert format_html_text(text) == "User &lt;user&gt;"
+
+
+def test_format_html_text_converts_backticks_to_code():
+    text = "Use `print()` function"
+    assert format_html_text(text) == "Use <code>print()</code> function"
+
+
+def test_format_html_text_converts_backticks_with_html_chars():
+    text = "Check `x < y` logic"
+    assert format_html_text(text) == "Check <code>x &lt; y</code> logic"
+
+
+def test_format_html_text_handles_multiple_code_blocks():
+    text = "Use `foo` and `bar`"
+    assert format_html_text(text) == "Use <code>foo</code> and <code>bar</code>"
+
+
+def test_format_html_text_handles_unbalanced_backticks():
+    # With an odd number of backticks, the last one opens a new <code> segment
+    # that runs to the end of the string in our simple split model.
+    text = "Use `foo` and `bar"
+    assert format_html_text(text) == "Use <code>foo</code> and <code>bar</code>"
+
+
+def test_format_html_text_with_links_and_code():
+    # Verify processing of inline code within markdown link labels.
+    text = "Click [`here`](https://example.com) now"
+    expected = 'Click <a href="https://example.com" rel="noopener noreferrer"><code>here</code></a> now'
+    assert format_html_text(text) == expected
+
+
+def test_format_html_text_handles_bold():
+    text = "This is **bold** text"
+    assert format_html_text(text) == "This is <strong>bold</strong> text"
+
+
+def test_format_html_text_handles_italic_stars():
+    text = "This is *italic* text"
+    assert format_html_text(text) == "This is <em>italic</em> text"
+
+
+def test_format_html_text_handles_italic_underscores():
+    text = "This is _italic_ text"
+    # Underscore delimiters are treated the same as asterisks;
+    # Markdown accepts _italic_ as well as *italic*.
+    assert format_html_text(text) == "This is <em>italic</em> text"
+
+
+def test_format_html_text_handles_mixed_emphasis():
+    text = "**Bold** and *Italic*"
+    assert format_html_text(text) == "<strong>Bold</strong> and <em>Italic</em>"
+
+
+def test_format_html_text_handles_nested_bold_italic():
+    text = "**Bold *and* Italic**"
+    # Expect: <strong>Bold <em>and</em> Italic</strong>
+    assert format_html_text(text) == "<strong>Bold <em>and</em> Italic</strong>"
+
+
+def test_format_html_text_does_not_emphasize_code():
+    text = "`*code*` inside"
+    # Content inside backticks is HTML-escaped but never emphasized
+    assert format_html_text(text) == "<code>*code*</code> inside"
+
+
+def test_format_html_text_ignores_bold_inside_code():
+    text = "Code `**not bold**` block"
+    assert format_html_text(text) == "Code <code>**not bold**</code> block"
+
+
+def test_format_html_text_ignores_underscore_inside_code():
+    text = "Code `_not italic_` block"
+    assert format_html_text(text) == "Code <code>_not italic_</code> block"
+
+
diff --git a/tests/research/test_orchestrator.py b/tests/research/test_orchestrator.py
@@ -182,12 +182,9 @@ def test_build_compendium_with_stub_client():
     # The input is now a list of message objects
     research_input = client.responses.calls[1]["input"]
     assert isinstance(research_input, list)
-    # Check if we can find the topic in the content of the user message
-    # e.g. input[-1]["content"] ... content might be list or str depending on promptdown/OAI
-    # Promptdown `to_responses_input` returns non-str content?
-    # Actually, `planning.py` calls `to_responses_input`.
-    # Let's just fuzzy match in the string representation for now, or drill down.
-    # We expect "Quantum Computing" in one of the messages.
+    # Check if we can find the topic in the content of the user message.
+    # The content structure varies (str or list of parts) depending on the source.
+    # We perform a robust check against both formats.
     found = False
     for msg in research_input:
         content = msg.get("content", "")