Skip to content

Commit 19ea520

Browse files
jlevy and claude committed
Add Markdoc/Jinja tag and HTML comment compatibility
Improves Flowmark handling of template tags by:
- Preserving newlines around Jinja/Markdoc tags
- Preserving newlines around HTML comment tags
- Keeping paired tags atomic during wrapping
- Refactoring tag logic into separate modules

This enables compatibility with Markdoc, Markform, WordPress Gutenberg, and other systems that use block-level template tags.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 2be08d6 commit 19ea520

File tree

13 files changed

+1404
-30
lines changed

13 files changed

+1404
-30
lines changed

docs/project/specs/active/plan-2026-01-14-markdoc-tag-compatibility.md

Lines changed: 434 additions & 0 deletions
Large diffs are not rendered by default.

src/flowmark/formats/flowmark_markdown.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
from typing_extensions import override
1717

1818
from flowmark.linewrapping.line_wrappers import (
19-
LineWrapper,
2019
line_wrap_by_sentence,
2120
line_wrap_to_width,
2221
)
22+
from flowmark.linewrapping.protocols import LineWrapper
2323
from flowmark.linewrapping.text_filling import DEFAULT_WRAP_WIDTH
2424

2525

src/flowmark/linewrapping/line_wrappers.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1+
from __future__ import annotations
2+
13
import re
24
from collections.abc import Callable
35
from typing import Protocol
46

7+
from flowmark.linewrapping.protocols import LineWrapper
58
from flowmark.linewrapping.sentence_split_regex import split_sentences_regex
9+
from flowmark.linewrapping.tag_handling import add_tag_newline_handling
610
from flowmark.linewrapping.text_filling import DEFAULT_WRAP_WIDTH
711
from flowmark.linewrapping.text_wrapping import (
812
DEFAULT_LEN_FUNCTION,
@@ -14,14 +18,6 @@
1418
"""Default minimum line length for sentence breaking."""
1519

1620

17-
class LineWrapper(Protocol):
18-
"""
19-
Takes a text string and any indents to use, and returns the wrapped text.
20-
"""
21-
22-
def __call__(self, text: str, initial_indent: str, subsequent_indent: str) -> str: ...
23-
24-
2521
class SentenceSplitter(Protocol):
2622
"""Takes a text string and returns a list of sentences."""
2723

@@ -97,7 +93,11 @@ def line_wrapper(text: str, initial_indent: str, subsequent_indent: str) -> str:
9793
)
9894

9995
if is_markdown:
100-
return _add_markdown_hard_break_handling(line_wrapper)
96+
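# Compose the wrappers: tag newline handling wraps the base wrapper
# (innermost), then hard break handling wraps that result (outermost).
# Order matters: tag handling should operate on original newlines
# before hard break handling normalizes explicit breaks.
# NOTE(review): the outermost wrapper receives the text first at call
# time -- confirm this composition order gives the intended effect.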
99+
enhanced = add_tag_newline_handling(line_wrapper)
100+
return _add_markdown_hard_break_handling(enhanced)
101101
else:
102102
return line_wrapper
103103

@@ -166,6 +166,8 @@ def line_wrapper(text: str, initial_indent: str, subsequent_indent: str) -> str:
166166
return "\n".join(lines)
167167

168168
if is_markdown:
169-
return _add_markdown_hard_break_handling(line_wrapper)
169+
# Apply tag newline handling first, then hard break handling
170+
enhanced = add_tag_newline_handling(line_wrapper)
171+
return _add_markdown_hard_break_handling(enhanced)
170172
else:
171173
return line_wrapper

src/flowmark/linewrapping/markdown_filling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
from flowmark.formats.flowmark_markdown import flowmark_markdown
1717
from flowmark.formats.frontmatter import split_frontmatter
1818
from flowmark.linewrapping.line_wrappers import (
19-
LineWrapper,
2019
line_wrap_by_sentence,
2120
line_wrap_to_width,
2221
)
22+
from flowmark.linewrapping.protocols import LineWrapper
2323
from flowmark.linewrapping.sentence_split_regex import split_sentences_regex
2424
from flowmark.linewrapping.text_filling import DEFAULT_WRAP_WIDTH
2525
from flowmark.transforms.doc_cleanups import doc_cleanups
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Protocol definitions for the linewrapping module.
3+
"""
4+
5+
from __future__ import annotations
6+
7+
from typing import Protocol
8+
9+
10+
class LineWrapper(Protocol):
    """
    Takes a text string and any indents to use, and returns the wrapped text.

    This is a structural (duck-typed) interface: any callable accepting
    `(text, initial_indent, subsequent_indent)` and returning a string
    satisfies it, so wrappers can be composed by passing one to another.
    """

    # NOTE(review): `initial_indent` presumably prefixes the first output line
    # and `subsequent_indent` the remaining lines (as in `textwrap`) -- confirm
    # against the concrete wrapper implementations.
    def __call__(self, text: str, initial_indent: str, subsequent_indent: str) -> str: ...
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
"""
2+
Tag handling for Jinja/Markdoc tags and HTML comments.
3+
4+
This module provides detection and handling of template tags used by systems like
5+
Markdoc, Markform, Jinja, Nunjucks, and WordPress Gutenberg.
6+
7+
The main concerns are:
8+
1. Detecting tag boundaries to preserve newlines around them
9+
2. Providing constants for tag delimiters used in word splitting patterns
10+
"""
11+
12+
from __future__ import annotations
13+
14+
from flowmark.linewrapping.protocols import LineWrapper
15+
16+
# Tag delimiters - all tag syntax defined in one place for consistency.
#
# Supported tag formats:
# - Jinja/Markdoc: {% tag %}, {% /tag %}, {# comment #}, {{ variable }}
# - HTML comments: <!-- tag -->, <!-- /tag -->
#
# Two parallel groups are defined below: the literal delimiters (for plain
# string checks such as `startswith`/`endswith`) and regex-escaped versions
# (for embedding in regex patterns). Keep the two groups in sync when editing.

# Jinja/Markdoc template tags
JINJA_TAG_OPEN = "{%"
JINJA_TAG_CLOSE = "%}"
# Jinja comments
JINJA_COMMENT_OPEN = "{#"
JINJA_COMMENT_CLOSE = "#}"
# Jinja variables
JINJA_VAR_OPEN = "{{"
JINJA_VAR_CLOSE = "}}"
# HTML comments
HTML_COMMENT_OPEN = "<!--"
HTML_COMMENT_CLOSE = "-->"

# Regex-escaped versions of delimiters (for use in regex patterns).
# Braces are backslash-escaped; the remaining characters are used literally.
# Each *_RE constant is meant to match exactly the literal delimiter of the
# same name above.
JINJA_TAG_OPEN_RE = r"\{%"
JINJA_TAG_CLOSE_RE = r"%\}"
JINJA_COMMENT_OPEN_RE = r"\{#"
JINJA_COMMENT_CLOSE_RE = r"#\}"
JINJA_VAR_OPEN_RE = r"\{\{"
JINJA_VAR_CLOSE_RE = r"\}\}"
HTML_COMMENT_OPEN_RE = r"<!--"
HTML_COMMENT_CLOSE_RE = r"-->"
44+
45+
46+
def line_ends_with_tag(line: str) -> bool:
47+
"""Check if a line ends with a Jinja/Markdoc tag or HTML comment."""
48+
stripped = line.rstrip()
49+
if not stripped:
50+
return False
51+
# Check for Jinja-style tags
52+
if (
53+
stripped.endswith(JINJA_TAG_CLOSE)
54+
or stripped.endswith(JINJA_COMMENT_CLOSE)
55+
or stripped.endswith(JINJA_VAR_CLOSE)
56+
):
57+
return True
58+
# Check for HTML comments
59+
if stripped.endswith(HTML_COMMENT_CLOSE):
60+
return True
61+
return False
62+
63+
64+
def line_starts_with_tag(line: str) -> bool:
65+
"""Check if a line starts with a Jinja/Markdoc tag or HTML comment."""
66+
stripped = line.lstrip()
67+
if not stripped:
68+
return False
69+
# Check for Jinja-style tags
70+
if (
71+
stripped.startswith(JINJA_TAG_OPEN)
72+
or stripped.startswith(JINJA_COMMENT_OPEN)
73+
or stripped.startswith(JINJA_VAR_OPEN)
74+
):
75+
return True
76+
# Check for HTML comments
77+
if stripped.startswith(HTML_COMMENT_OPEN):
78+
return True
79+
return False
80+
81+
82+
def add_tag_newline_handling(base_wrapper: LineWrapper) -> LineWrapper:
83+
"""
84+
Augments a LineWrapper to preserve newlines around Jinja/Markdoc tags
85+
and HTML comments.
86+
87+
When a line ends with a tag or the next line starts with a tag,
88+
the newline between them is preserved rather than being normalized
89+
away during text wrapping.
90+
91+
This enables compatibility with Markdoc, Markform, and similar systems
92+
that use block-level tags like `{% field %}...{% /field %}`.
93+
94+
IMPORTANT LIMITATION: This operates at the line-wrapping level, AFTER
95+
Markdown parsing. If the Markdown parser (Marko) has already interpreted
96+
content as part of a block element (e.g., list item continuation), we
97+
cannot undo that structure. For example:
98+
99+
- list item
100+
{% /tag %}
101+
102+
The parser may treat `{% /tag %}` as list continuation, causing it to
103+
be indented. The newline IS preserved, but indentation is added.
104+
105+
WORKAROUND: Use blank lines around block elements inside tags:
106+
107+
{% field %}
108+
109+
- Item 1
110+
- Item 2
111+
112+
{% /field %}
113+
"""
114+
115+
def enhanced_wrapper(text: str, initial_indent: str, subsequent_indent: str) -> str:
116+
# If no newlines, nothing to preserve
117+
if "\n" not in text:
118+
return base_wrapper(text, initial_indent, subsequent_indent)
119+
120+
lines = text.split("\n")
121+
122+
# If only one line after split, nothing to preserve
123+
if len(lines) <= 1:
124+
return base_wrapper(text, initial_indent, subsequent_indent)
125+
126+
# Group lines into segments that should be wrapped together
127+
# A new segment starts when:
128+
# - The previous line ends with a tag
129+
# - The current line starts with a tag
130+
segments: list[str] = []
131+
current_segment_lines: list[str] = []
132+
133+
for i, line in enumerate(lines):
134+
is_first_line = i == 0
135+
prev_ends_with_tag = not is_first_line and line_ends_with_tag(lines[i - 1])
136+
curr_starts_with_tag = line_starts_with_tag(line)
137+
138+
# Start a new segment if there's a tag boundary
139+
if prev_ends_with_tag or curr_starts_with_tag:
140+
if current_segment_lines:
141+
segments.append("\n".join(current_segment_lines))
142+
current_segment_lines = []
143+
144+
current_segment_lines.append(line)
145+
146+
# Don't forget the last segment
147+
if current_segment_lines:
148+
segments.append("\n".join(current_segment_lines))
149+
150+
# If we only have one segment, no tag boundaries were found
151+
if len(segments) == 1:
152+
return base_wrapper(text, initial_indent, subsequent_indent)
153+
154+
# Wrap each segment separately and rejoin with newlines
155+
wrapped_segments: list[str] = []
156+
for i, segment in enumerate(segments):
157+
is_first = i == 0
158+
cur_initial_indent = initial_indent if is_first else subsequent_indent
159+
wrapped = base_wrapper(segment, cur_initial_indent, subsequent_indent)
160+
wrapped_segments.append(wrapped)
161+
162+
return "\n".join(wrapped_segments)
163+
164+
return enhanced_wrapper

src/flowmark/linewrapping/text_wrapping.py

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,31 @@
44
from collections.abc import Callable
55
from typing import Protocol
66

7+
from flowmark.linewrapping.tag_handling import (
8+
HTML_COMMENT_CLOSE_RE as _HTML_COMMENT_CLOSE_RE,
9+
)
10+
from flowmark.linewrapping.tag_handling import (
11+
HTML_COMMENT_OPEN_RE as _HTML_COMMENT_OPEN_RE,
12+
)
13+
from flowmark.linewrapping.tag_handling import (
14+
JINJA_COMMENT_CLOSE_RE as _JINJA_COMMENT_CLOSE_RE,
15+
)
16+
from flowmark.linewrapping.tag_handling import (
17+
JINJA_COMMENT_OPEN_RE as _JINJA_COMMENT_OPEN_RE,
18+
)
19+
from flowmark.linewrapping.tag_handling import (
20+
JINJA_TAG_CLOSE_RE as _JINJA_TAG_CLOSE_RE,
21+
)
22+
from flowmark.linewrapping.tag_handling import (
23+
JINJA_TAG_OPEN_RE as _JINJA_TAG_OPEN_RE,
24+
)
25+
from flowmark.linewrapping.tag_handling import (
26+
JINJA_VAR_CLOSE_RE as _JINJA_VAR_CLOSE_RE,
27+
)
28+
from flowmark.linewrapping.tag_handling import (
29+
JINJA_VAR_OPEN_RE as _JINJA_VAR_OPEN_RE,
30+
)
31+
732
DEFAULT_LEN_FUNCTION = len
833
"""
934
Default length function to use for wrapping.
@@ -66,6 +91,9 @@ class _HtmlMdWordSplitter:
6691
This is compatible with CommonMark because we don't interpret code span
6792
content—we just keep tokens together for sensible line wrapping.
6893
See: https://spec.commonmark.org/0.31.2/#code-spans
94+
95+
Note: This class runs AFTER Markdown parsing, so any CommonMark escape
96+
sequences will have already been processed by Marko before we see the text.
6997
"""
7098

7199
# Pattern to detect COMPLETE inline code spans (both opening and closing backticks
@@ -81,6 +109,18 @@ def __init__(self):
81109
# Each pattern is a tuple of regexes: (start, middle..., end).
82110
# All tag types support up to MAX_TAG_WORDS words.
83111
self.patterns: list[tuple[str, ...]] = [
112+
# Paired Jinja/Markdoc tags: {% tag %}{% /tag %} (with optional space between)
113+
# This handles empty fields like {% field %}{% /field %}
114+
# Must come before single tag patterns so it matches first
115+
(
116+
rf".*{_JINJA_TAG_CLOSE_RE}",
117+
rf"{_JINJA_TAG_OPEN_RE}\s*/.*{_JINJA_TAG_CLOSE_RE}",
118+
),
119+
# Paired HTML comment tags: <!-- tag --><!-- /tag -->
120+
(
121+
rf".*{_HTML_COMMENT_CLOSE_RE}",
122+
rf"{_HTML_COMMENT_OPEN_RE}\s*/.*{_HTML_COMMENT_CLOSE_RE}",
123+
),
84124
# Inline code spans with spaces: `code with spaces`
85125
# Per CommonMark, code spans are delimited by equal-length backtick strings.
86126
# We coalesce words between opening ` and closing ` to keep them atomic.
@@ -93,8 +133,8 @@ def __init__(self):
93133
# HTML comments: <!-- comment text -->
94134
# Keep inline comments together, don't force to separate lines
95135
*_generate_tag_patterns(
96-
start=r"<!--.*",
97-
end=r".*-->",
136+
start=rf"{_HTML_COMMENT_OPEN_RE}.*",
137+
end=rf".*{_HTML_COMMENT_CLOSE_RE}",
98138
middle=r".+",
99139
),
100140
# HTML/XML tags: <tag attr="value">content</tag>
@@ -113,20 +153,20 @@ def __init__(self):
113153
),
114154
# Template tags {% ... %} (Markdoc/Jinja/Nunjucks)
115155
*_generate_tag_patterns(
116-
start=r"\{%",
117-
end=r".*%\}",
156+
start=_JINJA_TAG_OPEN_RE,
157+
end=rf".*{_JINJA_TAG_CLOSE_RE}",
118158
middle=r".+",
119159
),
120160
# Template comments {# ... #} (Jinja/Nunjucks)
121161
*_generate_tag_patterns(
122-
start=r"\{#",
123-
end=r".*#\}",
162+
start=_JINJA_COMMENT_OPEN_RE,
163+
end=rf".*{_JINJA_COMMENT_CLOSE_RE}",
124164
middle=r".+",
125165
),
126166
# Template variables {{ ... }} (Jinja/Nunjucks)
127167
*_generate_tag_patterns(
128-
start=r"\{\{",
129-
end=r".*\}\}",
168+
start=_JINJA_VAR_OPEN_RE,
169+
end=rf".*{_JINJA_VAR_CLOSE_RE}",
130170
middle=r".+",
131171
),
132172
]
@@ -135,7 +175,35 @@ def __init__(self):
135175
for pattern_group in self.patterns
136176
]
137177

178+
# Pattern to find adjacent tags (closing tag immediately followed by opening tag)
# This handles cases like %}{% or --><!-- where there's no space between.
# Each alternative pairs a closing delimiter with the opening delimiter of the
# same kind and captures them as two groups (closer, opener), so a match has
# exactly one non-None pair of groups. Mixed-kind pairs (e.g. %}{{) are not
# matched by this pattern.
_adjacent_tags_re: re.Pattern[str] = re.compile(
    rf"({_JINJA_TAG_CLOSE_RE})({_JINJA_TAG_OPEN_RE})|"
    rf"({_JINJA_COMMENT_CLOSE_RE})({_JINJA_COMMENT_OPEN_RE})|"
    rf"({_JINJA_VAR_CLOSE_RE})({_JINJA_VAR_OPEN_RE})|"
    rf"({_HTML_COMMENT_CLOSE_RE})({_HTML_COMMENT_OPEN_RE})"
)
186+
187+
def _normalize_adjacent_tags(self, text: str) -> str:
188+
"""
189+
Add a space between adjacent tags so they become separate tokens.
190+
For example: %}{% becomes %} {%
191+
"""
192+
193+
def add_space(match: re.Match[str]) -> str:
194+
# Find which group matched and insert space between closing and opening
195+
groups = match.groups()
196+
for i in range(0, len(groups), 2):
197+
if groups[i] is not None:
198+
return groups[i] + " " + groups[i + 1]
199+
return match.group(0)
200+
201+
return self._adjacent_tags_re.sub(add_space, text)
202+
138203
def __call__(self, text: str) -> list[str]:
204+
# First normalize adjacent tags to ensure proper tokenization
205+
text = self._normalize_adjacent_tags(text)
206+
139207
words = text.split()
140208
result: list[str] = []
141209
i = 0

0 commit comments

Comments
 (0)