Skip to content

Commit 4a1b16b

Browse files
committed
Add inline markdown processing for bold, italic, and code to HTML text formatting
1 parent 294a9db commit 4a1b16b

File tree

3 files changed

+163
-12
lines changed

3 files changed

+163
-12
lines changed

src/compendiumscribe/compendium/text_utils.py

Lines changed: 72 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,69 @@ def format_plain_text(text: str) -> str:
6868
return "".join(segments)
6969

7070

71+
def _can_open(text: str, start: int, body_start: int, intraword: bool) -> bool:
    """Return True when the delimiter at *start* may open a span.

    *body_start* is the index just past the delimiter. With
    ``intraword=False`` a delimiter glued to a preceding letter or digit is
    rejected, which keeps identifiers such as ``snake_case`` intact.
    """
    following = text[body_start:body_start + 1]
    if not following or following.isspace():
        # "2 * 3" or a trailing "*" is a literal character, not markup.
        return False
    if not intraword and start > 0 and text[start - 1].isalnum():
        return False
    return True


def _find_closer(text: str, delim: str, body_start: int, intraword: bool) -> int:
    """Return the index of the delimiter closing a span, or -1 if none.

    The search starts one character past *body_start* so a span is never
    empty. A closer must not be preceded by whitespace and, with
    ``intraword=False``, must not be followed by a letter or digit.
    """
    size = len(delim)
    pos = text.find(delim, body_start + 1)
    while pos != -1:
        # In a run such as "***" the last full delimiter closes the span, so
        # "***x***" nests as strong > em instead of leaving stray asterisks.
        last = pos
        while text.startswith(delim[0], last + size):
            last += 1
        after = text[last + size:last + size + 1]
        if not text[pos - 1].isspace() and (intraword or not after.isalnum()):
            return last
        pos = text.find(delim, last + size)
    return -1


def _split_delimited(
    text: str, delim: str, *, intraword: bool = True
) -> list[tuple[bool, str]]:
    """Split *text* into ``(inside, segment)`` pairs around *delim* spans.

    ``inside`` is True for text enclosed by a matched opener/closer pair.
    Delimiters that cannot be paired stay in the surrounding segment as
    literal characters instead of being dropped.
    """
    size = len(delim)
    segments: list[tuple[bool, str]] = []
    cursor = 0  # start of the text not yet emitted
    scan = 0  # where to look for the next opener
    while True:
        start = text.find(delim, scan)
        if start == -1:
            break
        body_start = start + size
        if not _can_open(text, start, body_start, intraword):
            scan = start + 1
            continue
        close = _find_closer(text, delim, body_start, intraword)
        if close == -1:
            # Closer validity does not depend on the opener, so no later
            # opener can be matched either.
            break
        segments.append((False, text[cursor:start]))
        segments.append((True, text[body_start:close]))
        cursor = scan = close + size
    segments.append((False, text[cursor:]))
    return segments


def process_italic(text: str) -> str:
    """Escape *text* for HTML and wrap ``*text*`` / ``_text_`` in <em> tags.

    ``*`` spans are matched first; ``_`` spans are then matched in the text
    outside them. The content of a span is HTML-escaped only. Unpaired or
    whitespace-flanked delimiters and intraword underscores (``snake_case``)
    are emitted literally.
    """
    rendered: list[str] = []
    for starred, chunk in _split_delimited(text, "*"):
        if starred:
            rendered.append(f"<em>{html.escape(chunk)}</em>")
            continue
        for underscored, piece in _split_delimited(chunk, "_", intraword=False):
            escaped = html.escape(piece)
            rendered.append(f"<em>{escaped}</em>" if underscored else escaped)
    return "".join(rendered)


def process_bold(text: str) -> str:
    """Escape *text* for HTML and wrap ``**text**`` / ``__text__`` in <strong>.

    ``**`` spans are matched first; ``__`` spans are then matched in the text
    outside them. Every segment, inside or outside a bold span, is passed
    through ``process_italic`` so italics nest inside bold and all text is
    HTML-escaped. Unpaired or whitespace-flanked delimiters and intraword
    ``__`` are emitted literally.
    """
    rendered: list[str] = []
    for starred, chunk in _split_delimited(text, "**"):
        if starred:
            rendered.append(f"<strong>{process_italic(chunk)}</strong>")
            continue
        for underscored, piece in _split_delimited(chunk, "__", intraword=False):
            inner = process_italic(piece)
            rendered.append(f"<strong>{inner}</strong>" if underscored else inner)
    return "".join(rendered)
117+
118+
119+
def process_inline_markdown(text: str) -> str:
    """Render inline code, bold and italic markup in *text* as escaped HTML.

    The text is split on backticks. Odd-numbered chunks sit inside backticks
    and are only HTML-escaped and wrapped in <code>; even-numbered chunks are
    handed to ``process_bold`` for emphasis handling. An unpaired backtick
    therefore opens a code span that runs to the end of the text.
    """
    rendered = [
        f"<code>{html.escape(chunk)}</code>" if index % 2 else process_bold(chunk)
        for index, chunk in enumerate(text.split("`"))
    ]
    return "".join(rendered)
130+
131+
71132
def format_html_text(text: str | None) -> str:
72-
"""Render Markdown-style links as HTML anchors while escaping content."""
133+
"""Render Markdown-style links and inline formatting as HTML."""
73134

74135
if text is None:
75136
return ""
@@ -79,21 +140,26 @@ def format_html_text(text: str | None) -> str:
79140
parts: list[str] = []
80141
cursor = 0
81142
for start, end, label, url in iter_markdown_links(text):
82-
parts.append(html.escape(text[cursor:start]))
143+
# Process text before the link
144+
parts.append(process_inline_markdown(text[cursor:start]))
145+
83146
clean_url = url.strip()
84-
escaped_label = html.escape(label)
147+
# Process markdown inside the link label
148+
processed_label = process_inline_markdown(label)
149+
85150
if clean_url:
86151
escaped_url = html.escape(clean_url, quote=True)
87152
anchor = (
88153
f"<a href=\"{escaped_url}\" "
89-
f"rel=\"noopener noreferrer\">{escaped_label}</a>"
154+
f"rel=\"noopener noreferrer\">{processed_label}</a>"
90155
)
91156
parts.append(anchor)
92157
else:
93-
parts.append(escaped_label)
158+
parts.append(processed_label)
94159
cursor = end
95160

96-
parts.append(html.escape(text[cursor:]))
161+
# Process remaining text
162+
parts.append(process_inline_markdown(text[cursor:]))
97163
return "".join(parts)
98164

99165

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
from __future__ import annotations
2+
3+
from compendiumscribe.compendium.text_utils import format_html_text
4+
5+
6+
def test_format_html_text_with_plain_text():
    """Text without markup or special characters is returned unchanged."""
    rendered = format_html_text("Hello world")
    assert rendered == "Hello world"
9+
10+
11+
def test_format_html_text_escapes_special_chars():
    """Angle brackets in ordinary text are HTML-escaped."""
    rendered = format_html_text("User <user>")
    assert rendered == "User &lt;user&gt;"
14+
15+
16+
def test_format_html_text_converts_backticks_to_code():
    """A backtick-delimited span becomes a <code> element."""
    rendered = format_html_text("Use `print()` function")
    assert rendered == "Use <code>print()</code> function"
19+
20+
21+
def test_format_html_text_converts_backticks_with_html_chars():
    """HTML-special characters inside a code span are escaped."""
    rendered = format_html_text("Check `x < y` logic")
    assert rendered == "Check <code>x &lt; y</code> logic"
24+
25+
26+
def test_format_html_text_handles_multiple_code_blocks():
    """Each backtick pair in the same string yields its own <code> element."""
    rendered = format_html_text("Use `foo` and `bar`")
    assert rendered == "Use <code>foo</code> and <code>bar</code>"
29+
30+
31+
def test_format_html_text_handles_unbalanced_backticks():
    """An unpaired backtick opens a code span that runs to the end of input."""
    # With an odd number of backticks, the split-based model treats the final
    # backtick as the start of a new <code> segment extending to the end of
    # the string.
    rendered = format_html_text("Use `foo` and `bar")
    assert rendered == "Use <code>foo</code> and <code>bar</code>"
36+
37+
38+
def test_format_html_text_with_links_and_code():
    """Inline code inside a Markdown link label is rendered within the anchor."""
    rendered = format_html_text("Click [`here`](https://example.com) now")
    assert rendered == (
        'Click <a href="https://example.com" rel="noopener noreferrer">'
        '<code>here</code></a> now'
    )
43+
44+
45+
def test_format_html_text_handles_bold():
    """Double asterisks produce a <strong> element."""
    rendered = format_html_text("This is **bold** text")
    assert rendered == "This is <strong>bold</strong> text"
48+
49+
50+
def test_format_html_text_handles_italic_stars():
    """Single asterisks produce an <em> element."""
    rendered = format_html_text("This is *italic* text")
    assert rendered == "This is <em>italic</em> text"
53+
54+
55+
def test_format_html_text_handles_italic_underscores():
    """Single underscores produce an <em> element, the same as asterisks."""
    # Markdown conventionally accepts _italic_ as well as *italic*.
    rendered = format_html_text("This is _italic_ text")
    assert rendered == "This is <em>italic</em> text"
60+
61+
62+
def test_format_html_text_handles_mixed_emphasis():
    """Separate bold and italic spans in one string are both rendered."""
    rendered = format_html_text("**Bold** and *Italic*")
    assert rendered == "<strong>Bold</strong> and <em>Italic</em>"
65+
66+
67+
def test_format_html_text_handles_nested_bold_italic():
    """An italic span inside a bold span nests <em> within <strong>."""
    rendered = format_html_text("**Bold *and* Italic**")
    assert rendered == "<strong>Bold <em>and</em> Italic</strong>"
71+
72+
73+
def test_format_html_text_does_not_emphasize_code():
    """Asterisks inside a code span stay literal instead of becoming <em>."""
    rendered = format_html_text("`*code*` inside")
    assert rendered == "<code>*code*</code> inside"
77+
78+
79+
def test_format_html_text_ignores_bold_inside_code():
    """Double asterisks inside a code span stay literal."""
    rendered = format_html_text("Code `**not bold**` block")
    assert rendered == "Code <code>**not bold**</code> block"
82+
83+
84+
def test_format_html_text_ignores_underscore_inside_code():
    """Underscores inside a code span stay literal."""
    rendered = format_html_text("Code `_not italic_` block")
    assert rendered == "Code <code>_not italic_</code> block"
87+
88+

tests/research/test_orchestrator.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -182,12 +182,9 @@ def test_build_compendium_with_stub_client():
182182
# The input is now a list of message objects
183183
research_input = client.responses.calls[1]["input"]
184184
assert isinstance(research_input, list)
185-
# Check if we can find the topic in the content of the user message
186-
# e.g. input[-1]["content"] ... content might be list or str depending on promptdown/OAI
187-
# Promptdown `to_responses_input` returns non-str content?
188-
# Actually, `planning.py` calls `to_responses_input`.
189-
# Let's just fuzzy match in the string representation for now, or drill down.
190-
# We expect "Quantum Computing" in one of the messages.
185+
# Check if we can find the topic in the content of the user message.
186+
# The content structure varies (str or list of parts) depending on the source.
187+
# We perform a robust check against both formats.
191188
found = False
192189
for msg in research_input:
193190
content = msg.get("content", "")

0 commit comments

Comments
 (0)