Skip to content

Commit 9b30914

Browse files
authored
Merge pull request #16 from jlevy/feature/tag-formatting-edge-cases
Fix smart quotes to not apply inside Markdoc/Jinja/HTML tags
2 parents 1c3c561 + 0a5781e commit 9b30914

File tree

4 files changed

+523
-46
lines changed

4 files changed

+523
-46
lines changed

src/flowmark/linewrapping/tag_handling.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,20 @@
4747
HTML_COMMENT_CLOSE_RE = r"-->"
4848

4949

50+
# Regex matching one complete template tag, so that text inside a tag can be protected.
# A tag may run across several lines and may hold quotes in its attribute values,
# which is why DOTALL is set: `.` then also matches newlines inside a tag.
# VERBOSE treats `#` as the start of a comment, so a literal hash is spelled `[#]`.
TEMPLATE_TAG_PATTERN: re.Pattern[str] = re.compile(
    r"""
    \{%.*?%\} # Jinja/Markdoc template tags
    | \{[#].*?[#]\} # Jinja comments (use [#] to avoid VERBOSE comment)
    | \{\{.*?\}\} # Jinja variables
    | <!--.*?--> # HTML comments
    """,
    flags=re.DOTALL | re.VERBOSE,
)
63+
5064
# Pattern to detect adjacent tags (closing tag immediately followed by opening tag)
5165
# This handles cases like %}{% or --><!-- where there's no space between
5266
_adjacent_tags_re: re.Pattern[str] = re.compile(
Lines changed: 86 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import re
22
from re import Match, Pattern
33

4+
from flowmark.linewrapping.tag_handling import TEMPLATE_TAG_PATTERN
5+
46
# Precompiled regex patterns
57
PARAGRAPH_BREAK_PATTERN: Pattern[str] = re.compile(r"\n\s*\n")
68

@@ -19,49 +21,15 @@ def is_multi_paragraph(text: str) -> bool:
1921
return PARAGRAPH_BREAK_PATTERN.search(text) is not None
2022

2123

22-
def smart_quotes(text: str) -> str:
23-
r"""
24-
Replace straight ASCII quotes and apostrophes with typographic quotes and apostrophes
25-
when this can be done safely. Aims to be conservative so it doesn't break code or
26-
things that aren't language.
27-
28-
Text that is wrapped in single or double quotes is replaced with typographic quotes
29-
if it has whitespace or a newline at the front and is followed by whitespace or
30-
a [.,?!]. The content inside quotes must not contain any of the same type (single
31-
or double). Quotes containing paragraph breaks (two newlines) are left unchanged.
32-
33-
Straight quotes are converted to apostrophes if they are the only straight quote
34-
in the word, and have word characters on both sides:
35-
36-
I'm there with "George" -> I’m there with “George”
37-
"Hello," he said. -> “Hello,” he said.
38-
"I know!" -> “I know!”
39-
40-
Words in 'single quotes' work too -> Words in 'single quotes' work too
41-
42-
I'm there -> I’m there
43-
I'll be there, don't worry -> I’ll be there, don’t worry
44-
X is 'foo' -> X is ‘foo’
45-
46-
A few special rules to better help with English:
47-
48-
Jill's -> Jill’s
49-
James' -> James’
50-
51-
Other patterns are unchanged:
52-
53-
x="foo" -> x="foo"
54-
x='foo' -> x='foo'
55-
Blah'blah'blah -> Blah'blah'blah
56-
""quotes"s -> ""quotes"s
57-
\"escaped\" -> \"escaped\"
58-
'apos -> 'apos
59-
'apos'trophes -> 'apos'trophes
60-
$James' -> $James'
24+
def _apply_smart_quotes_to_text(text: str) -> str:
25+
"""
26+
Apply smart quote conversion to a text segment.
6127
28+
This is the core smart quotes logic, applied only to text that is NOT inside
29+
template tags.
6230
"""
6331

64-
# First handle quoted text - both single and double quotes
32+
# Handle quoted text - both single and double quotes
6533
def replace_quotes(match: Match[str]) -> str:
6634
prefix = match.group(1)
6735
double_content = match.group(2) # Content of double quotes
@@ -83,14 +51,11 @@ def replace_quotes(match: Match[str]) -> str:
8351

8452
result = QUOTE_PATTERN.sub(replace_quotes, text)
8553

86-
# Now handle apostrophes/contractions
54+
# Handle apostrophes/contractions
8755
# Only convert single quotes that are:
8856
# 1. The only quote in the word
8957
# 2. Have word characters on both sides OR are possessives at end of words ending in s/S
9058

91-
# Pattern for apostrophes: word char + ' + word char, where ' is the only quote in the word
92-
# We need to be careful not to match words that have multiple quotes
93-
9459
# Split by whitespace to process words individually
9560
words = re.split(r"(\s+)", result)
9661

@@ -115,3 +80,80 @@ def replace_quotes(match: Match[str]) -> str:
11580
words[i] = re.sub(r"\'", "\u2019", word)
11681

11782
return "".join(words)
83+
84+
85+
def smart_quotes(text: str) -> str:
    r"""
    Convert straight ASCII quotes and apostrophes into typographic ones wherever that
    is safe. The conversion is deliberately conservative so that code and other
    non-language text are not broken.

    IMPORTANT: Anything inside a template tag (Jinja/Markdoc `{% %}`, `{# #}`, `{{ }}`,
    or an HTML comment `<!-- -->`) is passed through untouched, because converting
    quotes there would break the template syntax.

    Text wrapped in single or double quotes gets typographic quotes when the opening
    quote has whitespace or a newline in front of it and the closing quote is followed
    by whitespace or one of [.,?!]. The quoted content must not contain a quote of the
    same type (single or double). Quotes containing paragraph breaks (two newlines)
    are left unchanged.

    A straight quote becomes an apostrophe when it is the only straight quote in the
    word and has word characters on both sides:

      I'm there with "George" -> I’m there with “George”
      "Hello," he said. -> “Hello,” he said.
      "I know!" -> “I know!”

      Words in 'single quotes' work too -> Words in 'single quotes' work too

      I'm there -> I’m there
      I'll be there, don't worry -> I’ll be there, don’t worry
      X is 'foo' -> X is ‘foo’

    A few special rules to better help with English:

      Jill's -> Jill’s
      James' -> James’

    Other patterns are unchanged:

      x="foo" -> x="foo"
      x='foo' -> x='foo'
      Blah'blah'blah -> Blah'blah'blah
      ""quotes"s -> ""quotes"s
      \"escaped\" -> \"escaped\"
      'apos -> 'apos
      'apos'trophes -> 'apos'trophes
      $James' -> $James'

    Template tag content is never modified:

      {% field kind="string" %} -> {% field kind="string" %}
      {{ variable }} -> {{ variable }}
      {# comment "here" #} -> {# comment "here" #}
      <!-- html kind="comment" --> -> <!-- html kind="comment" -->

    """
    # Walk the tags left to right. Each stretch of plain text between tags is
    # converted on its own; each tag is copied through verbatim.
    pieces: list[str] = []
    cursor = 0

    for tag in TEMPLATE_TAG_PATTERN.finditer(text):
        tag_start, tag_end = tag.span()

        # Plain text sitting between the previous tag (or the start) and this tag.
        if cursor < tag_start:
            pieces.append(_apply_smart_quotes_to_text(text[cursor:tag_start]))

        # The tag itself is protected and emitted exactly as written.
        pieces.append(tag.group(0))
        cursor = tag_end

    # Whatever follows the final tag (or the whole text when there were no tags).
    tail = text[cursor:]
    if tail:
        pieces.append(_apply_smart_quotes_to_text(tail))

    return "".join(pieces)

0 commit comments

Comments
 (0)