fix: codeblock without "<" consumes extra char

justinmk · justinmk · commit 443f2f5a7d07 · 2022-09-30T12:51:00.000+02:00
Problem:
If a codeblock does not have a terminating "<" char, it consumes the
first char of the next token.

Solution:
Define (codeblock) only in terms of its lines; it doesn't need to look
for its "end". Instead, add $._line_end_codeblock to the list of things
that can terminate a (block).
diff --git a/README.md b/README.md
@@ -7,48 +7,46 @@ well-formed; the _input_ (vimdoc) is secondary. The first step should always be
 to try to fix the input (within reason) rather than insist on a grammar that
 handles vimdoc's endless quirks.
 
-Notes
------
+Overview
+--------
 
 - vimdoc format "spec":
     - [:help help-writing](https://neovim.io/doc/user/helphelp.html#help-writing)
     - https://github.com/nanotee/vimdoc-notes
-- whitespace is intentionally captured in `(word)`, because it is often necessary to be
-  able to correctly layout vim help files (especially old/legacy).
-- `(codeblock)` is contained by `(line)` because `>` can start a code block at the end of a line.
-- `(column_heading)` is contained by `(line)` because `>` (to close
-  a `(codeblock)` can appear at the start of `(column_heading)`.
-- `h1` ("Heading 1"): `======` followed by text and optional `*tags*`.
-- `h2` ("Heading 2"): `------` followed by text and optional `*tags*`.
-- `h3` ("Heading 3"): only UPPERCASE WORDS, followed by optional `*tags*`.
+- whitespace is intentionally captured in all atoms, because it is often used
+  for "layout" and ascii art in legacy help files.
+- `block` is the main top-level node which contains `line` nodes.
+  - ends at blank line(s) or a line starting with `<`.
+- `line`:
+  - contains atoms (words, tags, taglinks, …).
+  - contains `codeblock` because `>` can start a codeblock at the end of a line.
+  - contains `column_heading` because `<` (the `codeblock` terminating char)
+    can appear at the start of a `column_heading`.
+- `codeblock`:
+  - contains `line` nodes which do not contain `word` nodes; each one is just the
+    full raw text line including whitespace. This is somewhat dictated by its
+    "preformatted" nature; parsing the contents would require loading a "child"
+    language (injection). See [#2](https://github.com/neovim/tree-sitter-vimdoc/issues/2).
+  - the terminating `<` (and any following whitespace) is discarded (anonymous).
+- `h1` = "Heading 1": `======` followed by text and optional `*tags*`.
+- `h2` = "Heading 2": `------` followed by text and optional `*tags*`.
+- `h3` = "Heading 3": only UPPERCASE WORDS, followed by optional `*tags*`.
 
 Known issues
 ------------
 
-- `line_li` ("list item") is _experimental_. It doesn't support nesting yet and
-  it may not work well; you can treat it as a normal `line` for layout purposes.
-- `codeblock` ">" must not be preceded only by tabs, a space char is required (" >").
-  See `:help lcs-tab` for example. Currently the grammar doesn't enforce this.
-- `codeblock` terminated by an "implicit stop" (i.e. no terminating `<`)
-  consumes the first char of the terminating line, and continues the parent
-  `block`, preventing top-level forms like `h1`, `h2` from being recognized
-  until a blank line is encountered.
-- `line` in a `codeblock` does not contain `word` atoms, it's just the full
-  raw text line including whitespace. This is somewhat dictated by its
-  "preformatted" nature; parsing the contents would require loading a "child"
-  language (injection). See [#2](https://github.com/vigoux/tree-sitter-vimdoc/issues/2).
+- `line_li` ("list item") is experimental. It doesn't support nesting yet.
+- Spec requires that `codeblock` delimiter ">" must be preceded by a space
+  (" >"), not a tab. But currently the grammar doesn't enforce this. Example:
+  `:help lcs-tab`.
+- `codeblock` terminated by an "implicit stop" (no terminating `<`) consumes
+  blank lines, preventing top-level forms like `h1` from being recognized.
 - `url` doesn't handle _surrounding_ parens. E.g. `(https://example.com/#yay)` yields `word`
 - `url` doesn't handle _nested_ parens. E.g. `(https://example.com/(foo)#yay)`
-- Ideally `block_end` should consume the last block of the document _only_ if that
-  block is missing a trailing blank line or EOL ("\n").
-    - TODO: consider simply _not supporting_ docs without EOL?
-- Ideally `line_noeol` should consume the last line of the document _only_ if
-  that line is missing EOL ("\n").
-    - TODO: consider simply _not supporting_ docs without EOL?
 
 TODO
 ----
 
 - `line_noeol` is a special-case to support documents that don't end in EOL.
-  Grammar could be a bit simpler if we just require EOL at end of document.
-- `line_modeline` (only at EOF)
+  Grammar could be simpler if we require EOL at end of document.
+- `line_modeline` ?
diff --git a/corpus/arguments.txt b/corpus/arguments.txt
@@ -43,7 +43,10 @@ NOT an argument
     (line
       (argument
         (word)
-        (ERROR))
+        (MISSING "}"))
+      (word)
+      (argument
+        (word))
       (word)
       (codespan
         (word))
diff --git a/corpus/codeblock.txt b/corpus/codeblock.txt
@@ -28,7 +28,8 @@ block3:
       (word))
     (line
       (codeblock
-        (line)))
+        (line))))
+  (block
     (line
       (word)))
   (block
@@ -100,6 +101,9 @@ codeblock with implicit stop (FIXME)
 
 -------------------------------
 h1-headline *foo*
+line1
+
+line2
 
 -------------------------------
 h1-headline *foo*
@@ -118,7 +122,12 @@ h1-headline *foo*
     (line
       (word)
       (tag
-        (word))))
+        (word)))
+    (line
+      (word)))
+  (block
+    (line
+      (word)))
   (h2
     (word)
     (tag
@@ -155,7 +164,9 @@ x
         (line)
         (line)
         (line)
-        (line)))))
+        (line)))
+    (line
+      (word))))
 
 ================================================================================
 tricky codeblock
@@ -166,7 +177,17 @@ tricky codeblock
   < line3
 <
 
+    Example: >
+
+        vim.spell.check()
+        -->
+        {
+            {'quik', 'bad', 4}
+        }
+<
+
 tricky
+
 --------------------------------------------------------------------------------
 
 (help_file
@@ -176,6 +197,16 @@ tricky
         (line)
         (line)
         (line))))
+  (block
+    (line
+      (word)
+      (codeblock
+        (line)
+        (line)
+        (line)
+        (line)
+        (line)
+        (line))))
   (block
     (line
       (word))))
@@ -243,3 +274,51 @@ To test for a non-empty string, use empty(): >
       (word)
       (codeblock
         (line)))))
+
+================================================================================
+codeblock stop and start on same line
+================================================================================
+    Examples: >
+        :lua vim.api.nvim_command('echo "Hello, Nvim!"')
+<    LuaJIT: >
+        :lua =jit.version
+<
+    *:lua-heredoc*
+:lua << [endmarker]
+{script}
+
+    Example: >
+        lua << EOF
+        EOF
+<
+
+--------------------------------------------------------------------------------
+
+(help_file
+  (block
+    (line
+      (word)
+      (codeblock
+        (line))))
+  (block
+    (line
+      (word)
+      (codeblock
+        (line))))
+  (block
+    (line
+      (tag
+        (word)))
+    (line
+      (word)
+      (word)
+      (word))
+    (line
+      (argument
+        (word))))
+  (block
+    (line
+      (word)
+      (codeblock
+        (line)
+        (line)))))
diff --git a/corpus/codespan.txt b/corpus/codespan.txt
@@ -46,11 +46,10 @@ an error`.
       (word))))
 
 ================================================================================
-NOT a codespan
+NOT codespan
 ================================================================================
-						*'* *'a* *`* *`a*
-'{a-z}  `{a-z}		Jump to the mark.
-						*g'* *g'a* *g`* *g`a*
+				*'* *'a* *`* *`a*
+				*g'* *g'a* *g`* *g`a*
 g'{mark}  g`{mark}
 
 --------------------------------------------------------------------------------
@@ -66,14 +65,6 @@ g'{mark}  g`{mark}
         (word))
       (tag
         (word)))
-    (ERROR)
-    (line
-      (argument
-        (word))
-      (word)
-      (word)
-      (word)
-      (word))
     (line
       (tag
         (word))
diff --git a/corpus/heading3-column_heading.txt b/corpus/heading3-column_heading.txt
@@ -133,7 +133,8 @@ column_heading should NOT parse atoms (links, tags, etc.) (FIXME)
     (line
       (word)
       (codeblock
-        (line)))
+        (line))))
+  (block
     (line
       (column_heading
         (optionlink
diff --git a/corpus/optionlink.txt b/corpus/optionlink.txt
@@ -29,7 +29,7 @@ world 'hello' world
       (word))))
 
 ================================================================================
-NOT an optionlink #7 #14
+NOT optionlink #7 #14
 ================================================================================
 
 Let's see if that works.
@@ -85,3 +85,14 @@ number: '04' 'ISO-10646-1' 'python3'
       (word)
       (word)
       (word))))
+
+================================================================================
+NOT optionlink (FIXME)
+================================================================================
+
+'{a-z}  `{a-z}		Jump to the mark.
+
+--------------------------------------------------------------------------------
+
+(help_file
+  (ERROR))
diff --git a/grammar.js b/grammar.js
@@ -1,3 +1,6 @@
+// https://tree-sitter.github.io/tree-sitter/creating-parsers
+// - Rules starting with underscore are hidden in the syntax tree.
+
 const _uppercase_word = /[A-Z0-9.()][-A-Z0-9.()_]+/;
 
 module.exports = grammar({
@@ -70,39 +73,42 @@ module.exports = grammar({
     )),
 
     // Text block/paragraph: adjacent lines followed by blank line(s).
-    block: ($) => prec.right(seq(
-        repeat1(choice($.line, $.line_li)),
-        repeat1(_blank()),
-      )
+    block: ($) => seq(
+      repeat1(choice($.line, $.line_li)),
+      choice(
+        token.immediate('<'),  // Eat codeblock terminating char.
+        $._blank),
+      repeat($._blank),
     ),
     // Special case: last block in the document may not end with blank line (nor even EOL).
-    block_end: ($) => prec.right(choice(
+    block_end: ($) => choice(
       choice(
         alias($.line_noeol, $.line),
         alias($.line_li_noeol, $.line_li)),
       seq(repeat1(choice($.line, $.line_li)),
         choice(
           alias($.line_noeol, $.line),
-          alias($.line_li_noeol, $.line_li))))
+          alias($.line_li_noeol, $.line_li)))
     ),
 
-    // Code block: preformatted lines delimited by ">" and "<".
+    // Codeblock: preformatted block of lines starting with ">".
     codeblock: ($) => prec.right(seq(
       />[\t ]*\n/,
       repeat1(alias($.line_code, $.line)),
-      // Code block ends if a line starts with "<" or a non-empty line starts with a visible char.
-      token.immediate(choice(/<[\t ]*\n/, /[^\t\n ]/)),
+      // Codeblock ends if a line starts with non-whitespace.
+      // The terminating "<" is consumed in other rules.
     )),
 
     // Lines.
+    _blank: () => field('blank', /[\t ]*\n/),
     line: ($) => _line($, true),
     line_noeol: ($) => _line($, false),
     // Listitem line.
     line_li: ($) => seq(/[-*+•][ ]+/, repeat1($._atom), '\n'),
     line_li_noeol: ($) => seq(/[-*+•][ ]+/, repeat1($._atom)),
     // Codeblock lines: must be indented by at least 1 space/tab.
     // Line content (incl. whitespace) is captured as a single atom.
-    line_code: () => choice('\n', seq(/[\t ]+[^\n]+/, /\n/)),
+    line_code: () => choice('\n', /[\t ]+[^\n]+\n/),
 
     // "Column heading": plaintext followed by "~".
     // Intended for table column names per `:help help-writing`.
@@ -117,15 +123,15 @@ module.exports = grammar({
         token.immediate(field('delimiter', /============+[\t ]*\n/)),
         repeat1($._atom),
         '\n',
-        repeat(_blank()),
+        repeat($._blank),
       ),
 
     h2: ($) =>
       seq(
         token.immediate(field('delimiter', /------------+[\t ]*\n/)),
         repeat1($._atom),
         '\n',
-        repeat(_blank()),
+        repeat($._blank),
       ),
 
     // Heading 3: UPPERCASE NAME, followed by optional *tags*.
@@ -134,7 +140,7 @@ module.exports = grammar({
         field('name', $.uppercase_name),
         repeat($.tag),
         '\n',
-        repeat(_blank()),
+        repeat($._blank),
       ),
 
     tag: ($) => _word($,
@@ -185,7 +191,3 @@ function _line($, require_eol) {
     seq(optional($.uppercase_words), repeat1($._atom), choice($.codeblock, eol)),
   );
 }
-
-function _blank() {
-  return field('blank', /[\t ]*\n/);
-}