feat: support =begin/=end data regions and =for paragraphs

rabbiveesh · claude · rabbiveesh · commit 6932aefe1cac · 2026-03-22T15:23:46.000+02:00
Adds two new paragraph types, plus supporting nodes, to the grammar: - begin_paragraph: =begin format ... =end format (opaque data region) - for_paragraph: =for format content (single-paragraph data) - Dedicated begin_command, end_command, for_command, format_name nodes The scanner handles data sections by consuming everything between =begin and =end as opaque content, including lines that look like POD commands (e.g. =head1 inside a data region). The =end detector requires "=end" followed by whitespace/newline/EOF, so lines like "=ending" inside data are not falsely treated as terminators. Empty =begin/=end blocks (no content) are supported via zero-length data section tokens. Closes #8 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/grammar.js b/grammar.js
@@ -9,11 +9,14 @@ module.exports = grammar({
     $._intseq_letter,
     $._intseq_start,
     $._intseq_end,
+    $._data_section,
   ],
   rules: {
     pod: $ => repeat(choice(
       $.pod_paragraph,
 
+      $.begin_paragraph,
+      $.for_paragraph,
       $.command_paragraph,
 
       $.plain_paragraph,
@@ -30,6 +33,23 @@ module.exports = grammar({
     pod_paragraph: $ => seq($._start_command, $.pod_command, $._eol),
     pod_command: $ => '=pod',
 
+    begin_paragraph: $ => seq(
+      $._start_command, $.begin_command, /[ \t]+/, field('format', $.format_name), $._eol,
+      alias($._data_section, $.data),
+      $._start_command, $.end_command, /[ \t]+/, $.format_name, $._eol,
+    ),
+    begin_command: $ => '=begin',
+    end_command: $ => '=end',
+
+    for_paragraph: $ => seq(
+      $._start_command, $.for_command, /[ \t]+/, field('format', $.format_name),
+      optional(seq(/[ \t]+/, alias($._content_plain, $.content))),
+      $._eol,
+    ),
+    for_command: $ => '=for',
+
+    format_name: $ => /[a-zA-Z:]\S*/,
+
     // \s includes linefeed; tree-sitter doesn't seem to recognise \h for "horizontal whitespace
     command_paragraph: $ => seq($._start_command, field('command', $.command), /[ \t]*/, optional($.content), $._eol),
     command: $ => token(/=[a-zA-Z]\S*/),
diff --git a/queries/highlights.scm b/queries/highlights.scm
@@ -2,7 +2,10 @@
 
 [(pod_command)
  (command)
- (cut_command)] @keyword
+ (cut_command)
+ (begin_command)
+ (end_command)
+ (for_command)] @keyword
 
 (command_paragraph
   (command) @keyword
@@ -31,6 +34,10 @@
 
 (verbatim_paragraph (content) @text.literal)
 
+(begin_paragraph (format_name) @string.special)
+(for_paragraph (format_name) @string.special)
+(begin_paragraph (data) @text.literal)
+
 (interior_sequence
   (sequence_letter) @character
   ["<" ">"] @punctuation.delimiter
diff --git a/src/scanner.c b/src/scanner.c
@@ -44,6 +44,7 @@ enum TokenType {
   TOKEN_INTSEQ_LETTER,
   TOKEN_INTSEQ_START,
   TOKEN_INTSEQ_END,
+  TOKEN_DATA_SECTION,
 };
 
 #define MAX_NESTED_CHEVRONS 8
@@ -55,9 +56,7 @@ struct LexerState {
 
 void *tree_sitter_pod_external_scanner_create()
 {
-  struct LexerState *state = malloc(sizeof(struct LexerState));
-
-  state->nchevrons = 0;
+  struct LexerState *state = calloc(1, sizeof(struct LexerState));
 
   return state;
 }
@@ -109,6 +108,24 @@ static void chevron_count_pop(struct LexerState *state)
   state->nchevrons--;
 }
 
+/* Return true if the input at the lexer position is "=end" followed by a
+ * space, tab, CR, LF or EOF. Advances over each character it matches, even
+ * when it returns false, and never calls mark_end: mark_end first if needed. */
+static bool at_end_command(TSLexer *lexer)
+{
+  if(lexer->lookahead != '=') return false;
+  lexer->advance(lexer, false);
+  if(lexer->lookahead != 'e') return false;
+  lexer->advance(lexer, false);
+  if(lexer->lookahead != 'n') return false;
+  lexer->advance(lexer, false);
+  if(lexer->lookahead != 'd') return false;
+  lexer->advance(lexer, false);
+  /* Reject longer words such as "=ending": next char must be blank, EOL or EOF */
+  int next = lexer->lookahead;
+  return next == ' ' || next == '\t' || next == '\n' || next == '\r' || lexer->eof(lexer);
+}
+
 bool tree_sitter_pod_external_scanner_scan(
   void *payload,
   TSLexer *lexer,
@@ -139,6 +156,53 @@ bool tree_sitter_pod_external_scanner_scan(
   if(lexer->eof(lexer))
     return false;
 
+  /* Data section: consume everything until =end at column 0.
+   * Always emits TOKEN_DATA_SECTION (possibly zero-length for empty
+   * =begin/=end blocks). Must be checked before TOKEN_START_COMMAND
+   * since the parser expects _data_section first inside begin_paragraph. */
+  if(valid_symbols[TOKEN_DATA_SECTION]) {
+    lexer->mark_end(lexer); /* mark start position for potential zero-length token */
+    bool at_bol = true; /* we start right after _eol, so at beginning of line */
+
+    while(!lexer->eof(lexer)) {
+      c = lexer->lookahead;
+
+      /* At start of a line, check for =end */
+      if(at_bol && c == '=') {
+        lexer->mark_end(lexer);
+        if(at_end_command(lexer)) {
+          /* Found =end — return data up to here (may be zero-length) */
+          TOKEN(TOKEN_DATA_SECTION);
+        }
+        /* Not =end, continue consuming (advance already moved past =xxx) */
+        at_bol = false;
+        continue;
+      }
+
+      at_bol = false;
+
+      if(c == '\n') {
+        lexer->advance(lexer, false);
+        at_bol = true;
+        continue;
+      }
+      if(c == '\r') {
+        lexer->advance(lexer, false);
+        if(lexer->lookahead == '\n') {
+          lexer->advance(lexer, false);
+        }
+        at_bol = true;
+        continue;
+      }
+
+      lexer->advance(lexer, false);
+    }
+
+    /* EOF without =end — return whatever we consumed */
+    lexer->mark_end(lexer);
+    TOKEN(TOKEN_DATA_SECTION);
+  }
+
   if(valid_symbols[TOKEN_START_COMMAND] ||
      valid_symbols[TOKEN_START_PLAIN] ||
      valid_symbols[TOKEN_START_VERBATIM]) {
diff --git a/test/corpus/data-regions b/test/corpus/data-regions
@@ -0,0 +1,103 @@
+==========
+Basic begin/end
+==========
+=begin html
+
+<p>Hello world</p>
+
+=end html
+----------
+(pod
+  (begin_paragraph
+    (begin_command) (format_name)
+    (data)
+    (end_command) (format_name)))
+==========
+Empty begin/end
+==========
+=begin html
+=end html
+----------
+(pod
+  (begin_paragraph
+    (begin_command) (format_name)
+    (data)
+    (end_command) (format_name)))
+==========
+Begin/end with command-like content
+==========
+=begin text
+
+=head1 This is not a command
+=over 4
+Just data
+
+=end text
+----------
+(pod
+  (begin_paragraph
+    (begin_command) (format_name)
+    (data)
+    (end_command) (format_name)))
+==========
+For paragraph
+==========
+=for html <b>bold text</b>
+----------
+(pod
+  (for_paragraph
+    (for_command) (format_name) (content)))
+==========
+For paragraph without content
+==========
+=for comment
+----------
+(pod
+  (for_paragraph
+    (for_command) (format_name)))
+==========
+Begin/end does not consume past =end
+==========
+=begin html
+
+<p>data</p>
+
+=end html
+
+=head1 After
+----------
+(pod
+  (begin_paragraph
+    (begin_command) (format_name)
+    (data)
+    (end_command) (format_name))
+  (command_paragraph (command) (content)))
+==========
+Begin/end with colon format
+==========
+=begin :html
+
+<p>data</p>
+
+=end :html
+----------
+(pod
+  (begin_paragraph
+    (begin_command) (format_name)
+    (data)
+    (end_command) (format_name)))
+==========
+Not confused by =ending inside data
+==========
+=begin text
+
+=ending is not =end
+more data
+
+=end text
+----------
+(pod
+  (begin_paragraph
+    (begin_command) (format_name)
+    (data)
+    (end_command) (format_name)))
diff --git a/test/corpus/interior-sequences b/test/corpus/interior-sequences
@@ -91,9 +91,9 @@ C<
 ; nature of the errors might be fragile, so this test may need changing in
 ; future.
 (pod
-  (plain_paragraph
-    (content (interior_sequence (sequence_letter) (MISSING ">"))))
-  (ERROR (sequence_letter)))
+  (ERROR
+    (sequence_letter)
+    (format_name)))
 ==========
 Incomplete sequence does not loop forever
 ==========