jlevy
diff --git a/‎docs/project/specs/active/plan-2026-01-14-markdoc-tag-compatibility.md‎
Lines changed: 434 additions & 0 deletions b/‎docs/project/specs/active/plan-2026-01-14-markdoc-tag-compatibility.md‎
Lines changed: 434 additions & 0 deletions
diff --git a/‎src/flowmark/formats/flowmark_markdown.py‎
Lines changed: 1 addition & 1 deletion b/‎src/flowmark/formats/flowmark_markdown.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/flowmark/linewrapping/line_wrappers.py‎
Lines changed: 12 additions & 10 deletions b/‎src/flowmark/linewrapping/line_wrappers.py‎
Lines changed: 12 additions & 10 deletions
diff --git a/‎src/flowmark/linewrapping/markdown_filling.py‎
Lines changed: 1 addition & 1 deletion b/‎src/flowmark/linewrapping/markdown_filling.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/flowmark/linewrapping/protocols.py‎
Lines changed: 15 additions & 0 deletions b/‎src/flowmark/linewrapping/protocols.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎src/flowmark/linewrapping/tag_handling.py‎
Lines changed: 164 additions & 0 deletions b/‎src/flowmark/linewrapping/tag_handling.py‎
Lines changed: 164 additions & 0 deletions
diff --git a/‎src/flowmark/linewrapping/text_wrapping.py‎
Lines changed: 76 additions & 8 deletions b/‎src/flowmark/linewrapping/text_wrapping.py‎
Lines changed: 76 additions & 8 deletions
@@ -16,10 +16,10 @@
 from typing_extensions import override
 
 from flowmark.linewrapping.line_wrappers import (
-    LineWrapper,
     line_wrap_by_sentence,
     line_wrap_to_width,
 )
+from flowmark.linewrapping.protocols import LineWrapper
 from flowmark.linewrapping.text_filling import DEFAULT_WRAP_WIDTH
 
 
 
@@ -1,8 +1,12 @@
+from __future__ import annotations
+
 import re
 from collections.abc import Callable
 from typing import Protocol
 
+from flowmark.linewrapping.protocols import LineWrapper
 from flowmark.linewrapping.sentence_split_regex import split_sentences_regex
+from flowmark.linewrapping.tag_handling import add_tag_newline_handling
 from flowmark.linewrapping.text_filling import DEFAULT_WRAP_WIDTH
 from flowmark.linewrapping.text_wrapping import (
     DEFAULT_LEN_FUNCTION,
@@ -14,14 +18,6 @@
 """Default minimum line length for sentence breaking."""
 
 
-class LineWrapper(Protocol):
-    """
-    Takes a text string and any indents to use, and returns the wrapped text.
-    """
-
-    def __call__(self, text: str, initial_indent: str, subsequent_indent: str) -> str: ...
-
-
 class SentenceSplitter(Protocol):
     """Takes a text string and returns a list of sentences."""
 
@@ -97,7 +93,11 @@ def line_wrapper(text: str, initial_indent: str, subsequent_indent: str) -> str:
         )
 
     if is_markdown:
-        return _add_markdown_hard_break_handling(line_wrapper)
+        # Apply tag newline handling first, then hard break handling
+        # Order matters: tag handling should operate on original newlines
+        # before hard break handling normalizes explicit breaks
+        enhanced = add_tag_newline_handling(line_wrapper)
+        return _add_markdown_hard_break_handling(enhanced)
     else:
         return line_wrapper
 
@@ -166,6 +166,8 @@ def line_wrapper(text: str, initial_indent: str, subsequent_indent: str) -> str:
         return "\n".join(lines)
 
     if is_markdown:
-        return _add_markdown_hard_break_handling(line_wrapper)
+        # Apply tag newline handling first, then hard break handling
+        enhanced = add_tag_newline_handling(line_wrapper)
+        return _add_markdown_hard_break_handling(enhanced)
     else:
         return line_wrapper
@@ -16,10 +16,10 @@
 from flowmark.formats.flowmark_markdown import flowmark_markdown
 from flowmark.formats.frontmatter import split_frontmatter
 from flowmark.linewrapping.line_wrappers import (
-    LineWrapper,
     line_wrap_by_sentence,
     line_wrap_to_width,
 )
+from flowmark.linewrapping.protocols import LineWrapper
 from flowmark.linewrapping.sentence_split_regex import split_sentences_regex
 from flowmark.linewrapping.text_filling import DEFAULT_WRAP_WIDTH
 from flowmark.transforms.doc_cleanups import doc_cleanups
 
@@ -0,0 +1,15 @@
+"""
+Protocol definitions for the linewrapping module.
+"""
+
+from __future__ import annotations
+
+from typing import Protocol
+
+
+class LineWrapper(Protocol):
+    """
+    Structural type for line-wrapping callables.
+
+    Takes a text string and any indents to use, and returns the wrapped text.
+    """
+
+    # NOTE(review): `initial_indent` presumably prefixes the first output line and
+    # `subsequent_indent` the following ones (textwrap-style) - confirm against
+    # the concrete wrappers.
+    def __call__(self, text: str, initial_indent: str, subsequent_indent: str) -> str: ...
@@ -0,0 +1,164 @@
+"""
+Tag handling for Jinja/Markdoc tags and HTML comments.
+
+This module provides detection and handling of template tags used by systems like
+Markdoc, Markform, Jinja, Nunjucks, and WordPress Gutenberg.
+
+The main concerns are:
+1. Detecting tag boundaries to preserve newlines around them
+2. Providing constants for tag delimiters used in word splitting patterns
+"""
+
+from __future__ import annotations
+
+from flowmark.linewrapping.protocols import LineWrapper
+
+# Tag delimiters - all tag syntax defined in one place for consistency.
+#
+# Supported tag formats:
+# - Jinja/Markdoc: {% tag %}, {% /tag %}, {# comment #}, {{ variable }}
+# - HTML comments: <!-- tag -->, <!-- /tag -->
+#
+# Each delimiter is defined twice: once as a literal string (used for the plain
+# startswith/endswith checks in this module) and once as a regex-escaped `*_RE`
+# string (for embedding in regex patterns). Keep the two sets in sync.
+
+# Jinja/Markdoc template tags
+JINJA_TAG_OPEN = "{%"
+JINJA_TAG_CLOSE = "%}"
+# Jinja comments
+JINJA_COMMENT_OPEN = "{#"
+JINJA_COMMENT_CLOSE = "#}"
+# Jinja variables
+JINJA_VAR_OPEN = "{{"
+JINJA_VAR_CLOSE = "}}"
+# HTML comments
+HTML_COMMENT_OPEN = "<!--"
+HTML_COMMENT_CLOSE = "-->"
+
+# Regex-escaped versions of delimiters (for use in regex patterns)
+# Only the braces need a backslash; `%` and `#` are not escaped here.
+JINJA_TAG_OPEN_RE = r"\{%"
+JINJA_TAG_CLOSE_RE = r"%\}"
+JINJA_COMMENT_OPEN_RE = r"\{#"
+JINJA_COMMENT_CLOSE_RE = r"#\}"
+JINJA_VAR_OPEN_RE = r"\{\{"
+JINJA_VAR_CLOSE_RE = r"\}\}"
+# The HTML comment delimiters are written unescaped, identical to the literals.
+HTML_COMMENT_OPEN_RE = r"<!--"
+HTML_COMMENT_CLOSE_RE = r"-->"
+
+
+def line_ends_with_tag(line: str) -> bool:
+    """
+    Report whether `line`, ignoring trailing whitespace, ends with a closing
+    delimiter of a Jinja/Markdoc tag, comment, or variable, or of an HTML comment.
+    """
+    closing_delimiters = (
+        JINJA_TAG_CLOSE,
+        JINJA_COMMENT_CLOSE,
+        JINJA_VAR_CLOSE,
+        HTML_COMMENT_CLOSE,
+    )
+    # An empty or all-whitespace line strips to "" and so matches no delimiter.
+    return line.rstrip().endswith(closing_delimiters)
+
+
+def line_starts_with_tag(line: str) -> bool:
+    """
+    Report whether `line`, ignoring leading whitespace, starts with an opening
+    delimiter of a Jinja/Markdoc tag, comment, or variable, or of an HTML comment.
+    """
+    opening_delimiters = (
+        JINJA_TAG_OPEN,
+        JINJA_COMMENT_OPEN,
+        JINJA_VAR_OPEN,
+        HTML_COMMENT_OPEN,
+    )
+    # An empty or all-whitespace line strips to "" and so matches no delimiter.
+    return line.lstrip().startswith(opening_delimiters)
+
+
+def add_tag_newline_handling(base_wrapper: LineWrapper) -> LineWrapper:
+    """
+    Augments a LineWrapper to preserve newlines around Jinja/Markdoc tags
+    and HTML comments.
+
+    When a line ends with a tag or the next line starts with a tag,
+    the newline between them is preserved rather than being normalized
+    away during text wrapping. The text is split at those tag boundaries into
+    segments, each segment is passed to `base_wrapper` on its own, and the
+    wrapped segments are rejoined with single newlines. Text without any tag
+    boundary is passed to `base_wrapper` unchanged.
+
+    The first segment is wrapped with `initial_indent`; every later segment
+    uses `subsequent_indent` as its initial indent.
+
+    This enables compatibility with Markdoc, Markform, and similar systems
+    that use block-level tags like `{% field %}...{% /field %}`.
+
+    IMPORTANT LIMITATION: This operates at the line-wrapping level, AFTER
+    Markdown parsing. If the Markdown parser (Marko) has already interpreted
+    content as part of a block element (e.g., list item continuation), we
+    cannot undo that structure. For example:
+
+        - list item
+        {% /tag %}
+
+    The parser may treat `{% /tag %}` as list continuation, causing it to
+    be indented. The newline IS preserved, but indentation is added.
+
+    WORKAROUND: Use blank lines around block elements inside tags:
+
+        {% field %}
+
+        - Item 1
+        - Item 2
+
+        {% /field %}
+    """
+
+    def enhanced_wrapper(text: str, initial_indent: str, subsequent_indent: str) -> str:
+        # Splitting yields a single element exactly when the text has no
+        # newline, in which case there is no line boundary to preserve.
+        lines = text.split("\n")
+        if len(lines) == 1:
+            return base_wrapper(text, initial_indent, subsequent_indent)
+
+        # Group lines into segments that should be wrapped together.
+        # A new segment starts when the previous line ends with a tag or the
+        # current line starts with one. The first line always opens segment one.
+        segments: list[list[str]] = [[lines[0]]]
+        prev_ends_with_tag = line_ends_with_tag(lines[0])
+        for line in lines[1:]:
+            if prev_ends_with_tag or line_starts_with_tag(line):
+                segments.append([line])
+            else:
+                segments[-1].append(line)
+            prev_ends_with_tag = line_ends_with_tag(line)
+
+        # A single segment means no tag boundaries were found: wrap as usual.
+        if len(segments) == 1:
+            return base_wrapper(text, initial_indent, subsequent_indent)
+
+        # Wrap each segment separately and rejoin with newlines. Only the very
+        # first segment gets the caller's initial indent.
+        wrapped_segments = [
+            base_wrapper(
+                "\n".join(segment_lines),
+                initial_indent if index == 0 else subsequent_indent,
+                subsequent_indent,
+            )
+            for index, segment_lines in enumerate(segments)
+        ]
+        return "\n".join(wrapped_segments)
+
+    return enhanced_wrapper
@@ -4,6 +4,31 @@
 from collections.abc import Callable
 from typing import Protocol
 
+from flowmark.linewrapping.tag_handling import (
+    HTML_COMMENT_CLOSE_RE as _HTML_COMMENT_CLOSE_RE,
+)
+from flowmark.linewrapping.tag_handling import (
+    HTML_COMMENT_OPEN_RE as _HTML_COMMENT_OPEN_RE,
+)
+from flowmark.linewrapping.tag_handling import (
+    JINJA_COMMENT_CLOSE_RE as _JINJA_COMMENT_CLOSE_RE,
+)
+from flowmark.linewrapping.tag_handling import (
+    JINJA_COMMENT_OPEN_RE as _JINJA_COMMENT_OPEN_RE,
+)
+from flowmark.linewrapping.tag_handling import (
+    JINJA_TAG_CLOSE_RE as _JINJA_TAG_CLOSE_RE,
+)
+from flowmark.linewrapping.tag_handling import (
+    JINJA_TAG_OPEN_RE as _JINJA_TAG_OPEN_RE,
+)
+from flowmark.linewrapping.tag_handling import (
+    JINJA_VAR_CLOSE_RE as _JINJA_VAR_CLOSE_RE,
+)
+from flowmark.linewrapping.tag_handling import (
+    JINJA_VAR_OPEN_RE as _JINJA_VAR_OPEN_RE,
+)
+
 DEFAULT_LEN_FUNCTION = len
 """
 Default length function to use for wrapping.
@@ -66,6 +91,9 @@ class _HtmlMdWordSplitter:
     This is compatible with CommonMark because we don't interpret code span
     content—we just keep tokens together for sensible line wrapping.
     See: https://spec.commonmark.org/0.31.2/#code-spans
+
+    Note: This class runs AFTER Markdown parsing, so any CommonMark escape
+    sequences will have already been processed by Marko before we see the text.
     """
 
     # Pattern to detect COMPLETE inline code spans (both opening and closing backticks
@@ -81,6 +109,18 @@ def __init__(self):
         # Each pattern is a tuple of regexes: (start, middle..., end).
         # All tag types support up to MAX_TAG_WORDS words.
         self.patterns: list[tuple[str, ...]] = [
+            # Paired Jinja/Markdoc tags: {% tag %}{% /tag %} (with optional space between)
+            # This handles empty fields like {% field %}{% /field %}
+            # Must come before single tag patterns so it matches first
+            (
+                rf".*{_JINJA_TAG_CLOSE_RE}",
+                rf"{_JINJA_TAG_OPEN_RE}\s*/.*{_JINJA_TAG_CLOSE_RE}",
+            ),
+            # Paired HTML comment tags: <!-- tag --><!-- /tag -->
+            (
+                rf".*{_HTML_COMMENT_CLOSE_RE}",
+                rf"{_HTML_COMMENT_OPEN_RE}\s*/.*{_HTML_COMMENT_CLOSE_RE}",
+            ),
             # Inline code spans with spaces: `code with spaces`
             # Per CommonMark, code spans are delimited by equal-length backtick strings.
             # We coalesce words between opening ` and closing ` to keep them atomic.
@@ -93,8 +133,8 @@ def __init__(self):
             # HTML comments: <!-- comment text -->
             # Keep inline comments together, don't force to separate lines
             *_generate_tag_patterns(
-                start=r"<!--.*",
-                end=r".*-->",
+                start=rf"{_HTML_COMMENT_OPEN_RE}.*",
+                end=rf".*{_HTML_COMMENT_CLOSE_RE}",
                 middle=r".+",
             ),
             # HTML/XML tags: <tag attr="value">content</tag>
@@ -113,20 +153,20 @@ def __init__(self):
             ),
             # Template tags {% ... %} (Markdoc/Jinja/Nunjucks)
             *_generate_tag_patterns(
-                start=r"\{%",
-                end=r".*%\}",
+                start=_JINJA_TAG_OPEN_RE,
+                end=rf".*{_JINJA_TAG_CLOSE_RE}",
                 middle=r".+",
             ),
             # Template comments {# ... #} (Jinja/Nunjucks)
             *_generate_tag_patterns(
-                start=r"\{#",
-                end=r".*#\}",
+                start=_JINJA_COMMENT_OPEN_RE,
+                end=rf".*{_JINJA_COMMENT_CLOSE_RE}",
                 middle=r".+",
             ),
             # Template variables {{ ... }} (Jinja/Nunjucks)
             *_generate_tag_patterns(
-                start=r"\{\{",
-                end=r".*\}\}",
+                start=_JINJA_VAR_OPEN_RE,
+                end=rf".*{_JINJA_VAR_CLOSE_RE}",
                 middle=r".+",
             ),
         ]
@@ -135,7 +175,35 @@ def __init__(self):
             for pattern_group in self.patterns
         ]
 
+    # Pattern to find adjacent tags (closing tag immediately followed by opening tag)
+    # This handles cases like %}{% or --><!-- where there's no space between
+    # Capture groups come in (closing, opening) pairs, one pair per alternative:
+    # groups 1-2 Jinja tags, 3-4 Jinja comments, 5-6 Jinja variables, 7-8 HTML
+    # comments. Only same-kind pairs are matched (e.g. %}{{ is not).
+    _adjacent_tags_re: re.Pattern[str] = re.compile(
+        rf"({_JINJA_TAG_CLOSE_RE})({_JINJA_TAG_OPEN_RE})|"
+        rf"({_JINJA_COMMENT_CLOSE_RE})({_JINJA_COMMENT_OPEN_RE})|"
+        rf"({_JINJA_VAR_CLOSE_RE})({_JINJA_VAR_OPEN_RE})|"
+        rf"({_HTML_COMMENT_CLOSE_RE})({_HTML_COMMENT_OPEN_RE})"
+    )
+
+    def _normalize_adjacent_tags(self, text: str) -> str:
+        """
+        Insert a single space between a closing delimiter and the same-kind
+        opening delimiter that directly follows it, so the two tags become
+        separate tokens. For example: %}{% becomes %} {%
+        """
+
+        def separate(found: re.Match[str]) -> str:
+            # Groups are laid out as consecutive (closing, opening) pairs; only
+            # the pair belonging to the alternative that matched is populated.
+            captured = found.groups()
+            for closing, opening in zip(captured[0::2], captured[1::2]):
+                if closing is not None:
+                    return closing + " " + opening
+            return found.group(0)
+
+        return self._adjacent_tags_re.sub(separate, text)
+
     def __call__(self, text: str) -> list[str]:
+        # First normalize adjacent tags to ensure proper tokenization
+        text = self._normalize_adjacent_tags(text)
+
         words = text.split()
         result: list[str] = []
         i = 0
Original file line number	Diff line number	Diff line change
`@@ -16,10 +16,10 @@`
`16`	`16`	`from typing_extensions import override`
`17`	`17`
`18`	`18`	`from flowmark.linewrapping.line_wrappers import (`
`19`		`- LineWrapper,`
`20`	`19`	`line_wrap_by_sentence,`
`21`	`20`	`line_wrap_to_width,`
`22`	`21`	`)`
	`22`	`+from flowmark.linewrapping.protocols import LineWrapper`
`23`	`23`	`from flowmark.linewrapping.text_filling import DEFAULT_WRAP_WIDTH`
`24`	`24`
`25`	`25`