Skip to content

Commit 6932aef

Browse files
rabbiveesh and claude committed
feat: support =begin/=end data regions and =for paragraphs
Adds three new paragraph types to the grammar:

- begin_paragraph: =begin format ... =end format (opaque data region)
- for_paragraph: =for format content (single-paragraph data)
- Dedicated begin_command, end_command, for_command, format_name nodes

The scanner handles data sections by consuming everything between =begin and =end as opaque content, including lines that look like POD commands (e.g. =head1 inside a data region). The =end detector requires "=end" followed by whitespace/newline/EOF, so lines like "=ending" inside data are not falsely treated as terminators.

Empty =begin/=end blocks (no content) are supported via zero-length data section tokens.

Closes #8

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7124408 commit 6932aef

File tree

5 files changed

+201
-7
lines changed

5 files changed

+201
-7
lines changed

grammar.js

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,14 @@ module.exports = grammar({
99
$._intseq_letter,
1010
$._intseq_start,
1111
$._intseq_end,
12+
$._data_section,
1213
],
1314
rules: {
1415
pod: $ => repeat(choice(
1516
$.pod_paragraph,
1617

18+
$.begin_paragraph,
19+
$.for_paragraph,
1720
$.command_paragraph,
1821

1922
$.plain_paragraph,
@@ -30,6 +33,23 @@ module.exports = grammar({
3033
pod_paragraph: $ => seq($._start_command, $.pod_command, $._eol),
3134
pod_command: $ => '=pod',
3235

36+
begin_paragraph: $ => seq(
37+
$._start_command, $.begin_command, /[ \t]+/, field('format', $.format_name), $._eol,
38+
alias($._data_section, $.data),
39+
$._start_command, $.end_command, /[ \t]+/, $.format_name, $._eol,
40+
),
41+
begin_command: $ => '=begin',
42+
end_command: $ => '=end',
43+
44+
for_paragraph: $ => seq(
45+
$._start_command, $.for_command, /[ \t]+/, field('format', $.format_name),
46+
optional(seq(/[ \t]+/, alias($._content_plain, $.content))),
47+
$._eol,
48+
),
49+
for_command: $ => '=for',
50+
51+
format_name: $ => /[a-zA-Z:]\S*/,
52+
3353
// \s includes linefeed; tree-sitter doesn't seem to recognise \h for "horizontal whitespace
3454
command_paragraph: $ => seq($._start_command, field('command', $.command), /[ \t]*/, optional($.content), $._eol),
3555
command: $ => token(/=[a-zA-Z]\S*/),

queries/highlights.scm

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22

33
[(pod_command)
44
(command)
5-
(cut_command)] @keyword
5+
(cut_command)
6+
(begin_command)
7+
(end_command)
8+
(for_command)] @keyword
69

710
(command_paragraph
811
(command) @keyword
@@ -31,6 +34,10 @@
3134

3235
(verbatim_paragraph (content) @text.literal)
3336

37+
(begin_paragraph (format_name) @string.special)
38+
(for_paragraph (format_name) @string.special)
39+
(begin_paragraph (data) @text.literal)
40+
3441
(interior_sequence
3542
(sequence_letter) @character
3643
["<" ">"] @punctuation.delimiter

src/scanner.c

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ enum TokenType {
4444
TOKEN_INTSEQ_LETTER,
4545
TOKEN_INTSEQ_START,
4646
TOKEN_INTSEQ_END,
47+
TOKEN_DATA_SECTION,
4748
};
4849

4950
#define MAX_NESTED_CHEVRONS 8
@@ -55,9 +56,7 @@ struct LexerState {
5556

5657
void *tree_sitter_pod_external_scanner_create()
5758
{
58-
struct LexerState *state = malloc(sizeof(struct LexerState));
59-
60-
state->nchevrons = 0;
59+
struct LexerState *state = calloc(1, sizeof(struct LexerState));
6160

6261
return state;
6362
}
@@ -109,6 +108,24 @@ static void chevron_count_pop(struct LexerState *state)
109108
state->nchevrons--;
110109
}
111110

111+
/* Check if the lexer is at a line starting with "=end" followed by
112+
* whitespace, newline, or EOF. Peeks ahead without affecting mark_end.
113+
* Caller must call mark_end before this if they want to preserve position. */
114+
static bool at_end_command(TSLexer *lexer)
115+
{
116+
if(lexer->lookahead != '=') return false;
117+
lexer->advance(lexer, false);
118+
if(lexer->lookahead != 'e') return false;
119+
lexer->advance(lexer, false);
120+
if(lexer->lookahead != 'n') return false;
121+
lexer->advance(lexer, false);
122+
if(lexer->lookahead != 'd') return false;
123+
lexer->advance(lexer, false);
124+
/* =end must be followed by whitespace, newline, or EOF */
125+
int next = lexer->lookahead;
126+
return next == ' ' || next == '\t' || next == '\n' || next == '\r' || lexer->eof(lexer);
127+
}
128+
112129
bool tree_sitter_pod_external_scanner_scan(
113130
void *payload,
114131
TSLexer *lexer,
@@ -139,6 +156,53 @@ bool tree_sitter_pod_external_scanner_scan(
139156
if(lexer->eof(lexer))
140157
return false;
141158

159+
/* Data section: consume everything until =end at column 0.
160+
* Always emits TOKEN_DATA_SECTION (possibly zero-length for empty
161+
* =begin/=end blocks). Must be checked before TOKEN_START_COMMAND
162+
* since the parser expects _data_section first inside begin_paragraph. */
163+
if(valid_symbols[TOKEN_DATA_SECTION]) {
164+
lexer->mark_end(lexer); /* mark start position for potential zero-length token */
165+
bool at_bol = true; /* we start right after _eol, so at beginning of line */
166+
167+
while(!lexer->eof(lexer)) {
168+
c = lexer->lookahead;
169+
170+
/* At start of a line, check for =end */
171+
if(at_bol && c == '=') {
172+
lexer->mark_end(lexer);
173+
if(at_end_command(lexer)) {
174+
/* Found =end — return data up to here (may be zero-length) */
175+
TOKEN(TOKEN_DATA_SECTION);
176+
}
177+
/* Not =end, continue consuming (advance already moved past =xxx) */
178+
at_bol = false;
179+
continue;
180+
}
181+
182+
at_bol = false;
183+
184+
if(c == '\n') {
185+
lexer->advance(lexer, false);
186+
at_bol = true;
187+
continue;
188+
}
189+
if(c == '\r') {
190+
lexer->advance(lexer, false);
191+
if(lexer->lookahead == '\n') {
192+
lexer->advance(lexer, false);
193+
}
194+
at_bol = true;
195+
continue;
196+
}
197+
198+
lexer->advance(lexer, false);
199+
}
200+
201+
/* EOF without =end — return whatever we consumed */
202+
lexer->mark_end(lexer);
203+
TOKEN(TOKEN_DATA_SECTION);
204+
}
205+
142206
if(valid_symbols[TOKEN_START_COMMAND] ||
143207
valid_symbols[TOKEN_START_PLAIN] ||
144208
valid_symbols[TOKEN_START_VERBATIM]) {

test/corpus/data-regions

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
==========
2+
Basic begin/end
3+
==========
4+
=begin html
5+
6+
<p>Hello world</p>
7+
8+
=end html
9+
----------
10+
(pod
11+
(begin_paragraph
12+
(begin_command) (format_name)
13+
(data)
14+
(end_command) (format_name)))
15+
==========
16+
Empty begin/end
17+
==========
18+
=begin html
19+
=end html
20+
----------
21+
(pod
22+
(begin_paragraph
23+
(begin_command) (format_name)
24+
(data)
25+
(end_command) (format_name)))
26+
==========
27+
Begin/end with command-like content
28+
==========
29+
=begin text
30+
31+
=head1 This is not a command
32+
=over 4
33+
Just data
34+
35+
=end text
36+
----------
37+
(pod
38+
(begin_paragraph
39+
(begin_command) (format_name)
40+
(data)
41+
(end_command) (format_name)))
42+
==========
43+
For paragraph
44+
==========
45+
=for html <b>bold text</b>
46+
----------
47+
(pod
48+
(for_paragraph
49+
(for_command) (format_name) (content)))
50+
==========
51+
For paragraph without content
52+
==========
53+
=for comment
54+
----------
55+
(pod
56+
(for_paragraph
57+
(for_command) (format_name)))
58+
==========
59+
Begin/end does not consume past =end
60+
==========
61+
=begin html
62+
63+
<p>data</p>
64+
65+
=end html
66+
67+
=head1 After
68+
----------
69+
(pod
70+
(begin_paragraph
71+
(begin_command) (format_name)
72+
(data)
73+
(end_command) (format_name))
74+
(command_paragraph (command) (content)))
75+
==========
76+
Begin/end with colon format
77+
==========
78+
=begin :html
79+
80+
<p>data</p>
81+
82+
=end :html
83+
----------
84+
(pod
85+
(begin_paragraph
86+
(begin_command) (format_name)
87+
(data)
88+
(end_command) (format_name)))
89+
==========
90+
Not confused by =ending inside data
91+
==========
92+
=begin text
93+
94+
=ending is not =end
95+
more data
96+
97+
=end text
98+
----------
99+
(pod
100+
(begin_paragraph
101+
(begin_command) (format_name)
102+
(data)
103+
(end_command) (format_name)))

test/corpus/interior-sequences

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,9 @@ C<
9191
; nature of the errors might be fragile, so this test may need changing in
9292
; future.
9393
(pod
94-
(plain_paragraph
95-
(content (interior_sequence (sequence_letter) (MISSING ">"))))
96-
(ERROR (sequence_letter)))
94+
(ERROR
95+
(sequence_letter)
96+
(format_name)))
9797
==========
9898
Incomplete sequence does not loop forever
9999
==========

0 commit comments

Comments
 (0)