Skip to content

Commit 360ef13

Browse files
feat: upgrade tree-sitter and improve grammar
1 parent 7bc2bfc commit 360ef13

File tree

13 files changed

+7916
-5932
lines changed

13 files changed

+7916
-5932
lines changed

README.md

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,46 @@
1-
tree-sitter-forth
2-
==================
1+
# tree-sitter-forth
32

43
[![build](https://github.com/AlexanderBrevig/tree-sitter-forth/actions/workflows/ci.yml/badge.svg)](https://github.com/AlexanderBrevig/tree-sitter-forth/actions/workflows/ci.yml)
54

6-
[Forth][] grammar for [tree-sitter][].
5+
[Forth](https://forth-standard.org/) grammar for [tree-sitter](https://github.com/tree-sitter/tree-sitter) - a fast, incremental parser for syntax highlighting and code analysis.
76

8-
[Forth]: https://forth-standard.org/
9-
[tree-sitter]: https://github.com/tree-sitter/tree-sitter
7+
## Features
8+
9+
- **Complete number support**: Decimal, hex (`0xFF`, `$FF`), binary (`%1010`), octal (`&77`), character literals (`'c'`), floats (`3.14`, `1.5e-10`), and double-cell numbers (`123.`)
10+
- **Semantic categorization**: Control flow, operators, I/O, and core words are distinct AST nodes for better syntax highlighting
11+
- **Case-insensitive**: All Forth keywords are matched case-insensitively, as per the Forth standard
12+
- **Comment types**: Line comments (`\`), block comments, and stack effect comments (`( n -- result )`) are parsed as separate node types
13+
- **String types**: `s"`, `S"`, `c"`, `C"`, and `."` with proper tokenization
14+
- **Bindings**: Node.js and Rust support
15+
16+
## Installation
17+
18+
```bash
19+
npm install tree-sitter-forth
20+
```
21+
22+
## Usage
23+
24+
```bash
25+
# Generate parser
26+
npm run build
27+
28+
# Run tests
29+
npm test
30+
31+
# Parse a file
32+
npm run parse -- file.fs
33+
```
34+
35+
## Grammar Structure
36+
37+
The grammar categorizes Forth words into semantic groups:
38+
39+
- **Control Flow**: `IF/THEN/ELSE`, `BEGIN/UNTIL`, `DO/LOOP`, `CASE/OF`, etc.
40+
- **Operators**: Stack manipulation (`DUP`, `SWAP`, `ROT`), arithmetic (`+`, `-`, `MOD`), logic (`AND`, `OR`)
41+
- **I/O**: Input/output words (`.`, `EMIT`, `KEY`, `ACCEPT`)
42+
- **Core**: Defining words, memory operations, compilation words
43+
44+
## License
45+
46+
MIT

grammar.js

Lines changed: 85 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,26 +7,59 @@ function literal(keyword) {
77
return new RegExp(keyword, 'i');
88
};
99

10-
const builtin_oprs = ["=", "+", "-", "/", "*", "*/", ">", "<"];
11-
12-
const builtin_core = ["include", "!", "#", "#>", "#S", "'", "*/MOD", "+!", "+LOOP",
13-
",", ".", ".\"", "/MOD", "0<", "0=", "1+", "1-", "2!", "2*", "2/", "2@", "2DROP",
14-
"2DUP", "2OVER", "2SWAP", "<#", ">BODY", ">IN", ">NUMBER", ">R", "?DUP", "@",
15-
"ABORT", "ABORT\"", "ABS", "ACCEPT", "ALIGN", "ALIGNED", "ALLOT", "AND", "BASE",
16-
"BEGIN", "BL", "C!", "C,", "C@", "CELL+", "CELLS", "CHAR", "CHAR+", "CHARS",
17-
"CONSTANT", "COUNT", "CR", "CREATE", "DECIMAL", "DEPTH", "DO", "DOES>", "DROP",
18-
"DUP", "ELSE", "EMIT", "ENVIRONMENT?", "EVALUATE", "EXECUTE", "EXIT", "FILL",
19-
"FIND", "FM/MOD", "HERE", "HOLD", "I", "IF", "IMMEDIATE", "INVERT", "J", "KEY",
20-
"LEAVE", "LOOP", "LSHIFT", "M*", "MAX", "MIN", "MOD", "MOVE", "NEGATE", "OR",
21-
"OVER", "POSTPONE", "QUIT", "R>", "R@", "RECURSE", "REPEAT", "ROT", "RSHIFT",
22-
"S\"", "S>D", "SIGN", "SM/REM", "SOURCE", "SPACE", "SPACES", "STATE", "SWAP",
23-
"THEN", "TYPE", "U.", "U<", "UM*", "UM/MOD", "UNLOOP", "UNTIL", "VARIABLE",
24-
"WHILE", "WORD", "XOR", "[", "[']", "[CHAR]", ".(\"", ".R", ".S", "0<>", "0>",
25-
"2>R", "2R>", "2R@", ":NONAME", "<>", "?DO", "ACTION-OF", "AGAIN", "BUFFER:",
26-
"C\"", "CASE", "COMPILE,", "DEFER", "DEFER!", "DEFER@", "ENDCASE", "ENDOF",
27-
"ERASE", "FALSE", "HEX", "HOLDS", "IS", "MARKER", "NIP", "OF", "PAD", "PARSE",
28-
"PARSE-NAME", "PICK", "REFILL", "RESTORE-INPUT", "ROLL", "S\\\"", "SAVE-INPUT",
29-
"SOURCE-ID", "TO", "TRUE", "TUCK", "U.R", "U>", "UNUSED", "VALUE", "WITHIN", "[COMPILE]" ];
10+
const builtin_oprs = [
11+
// Arithmetic operators
12+
"=", "+", "-", "/", "*", "*/", ">", "<",
13+
"MOD", "/MOD", "*/MOD", "FM/MOD", "SM/REM",
14+
"1+", "1-", "2*", "2/",
15+
"M*", "UM*", "UM/MOD",
16+
"ABS", "NEGATE", "MIN", "MAX",
17+
// Logical operators
18+
"AND", "OR", "XOR", "INVERT",
19+
"LSHIFT", "RSHIFT",
20+
// Comparison operators
21+
"<>", "0<", "0=", "0>", "0<>",
22+
"U<", "U>",
23+
// Stack manipulation
24+
"DUP", "DROP", "SWAP", "OVER", "ROT", "NIP", "TUCK", "PICK", "ROLL",
25+
"2DUP", "2DROP", "2SWAP", "2OVER",
26+
">R", "R>", "R@",
27+
"2>R", "2R>", "2R@",
28+
"?DUP"
29+
];
30+
31+
const control_flow = ["IF", "THEN", "ELSE", "BEGIN", "UNTIL", "WHILE", "REPEAT",
32+
"AGAIN", "DO", "LOOP", "+LOOP", "?DO", "UNLOOP", "LEAVE", "EXIT", "RECURSE",
33+
"CASE", "OF", "ENDOF", "ENDCASE"];
34+
35+
const io_words = [
36+
// Output
37+
".", "EMIT", "TYPE", "SPACE", "SPACES", "CR", ".\"", ".S", ".R", "U.", "U.R",
38+
// Input
39+
"KEY", "ACCEPT", "WORD",
40+
// String parsing
41+
"PARSE", "PARSE-NAME",
42+
// Formatting
43+
".(\"",
44+
];
45+
46+
const builtin_core = ["include", "!", "#", "#>", "#S", "'", "+!",
47+
",", "2!", "2@",
48+
"<#", ">BODY", ">IN", ">NUMBER", "@",
49+
"ABORT", "ABORT\"", "ALIGN", "ALIGNED", "ALLOT", "BASE",
50+
"BL", "C!", "C,", "C@", "CELL+", "CELLS", "CHAR", "CHAR+", "CHARS",
51+
"CONSTANT", "COUNT", "CREATE", "DECIMAL", "DEPTH", "DOES>",
52+
"ENVIRONMENT?", "EVALUATE", "EXECUTE", "FILL",
53+
"FIND", "HERE", "HOLD", "I", "IMMEDIATE", "J",
54+
"MOVE", "POSTPONE", "QUIT",
55+
"S\"", "S>D", "SIGN", "SOURCE", "STATE",
56+
"VARIABLE",
57+
"[", "[']", "[CHAR]",
58+
":NONAME", "ACTION-OF", "BUFFER:",
59+
"C\"", "COMPILE,", "DEFER", "DEFER!", "DEFER@",
60+
"ERASE", "FALSE", "HEX", "HOLDS", "IS", "MARKER", "PAD",
61+
"REFILL", "RESTORE-INPUT", "S\\\"", "SAVE-INPUT",
62+
"SOURCE-ID", "TO", "TRUE", "UNUSED", "VALUE", "WITHIN", "[COMPILE]" ];
3063

3164
module.exports = grammar({
3265
name: 'forth',
@@ -40,23 +73,47 @@ module.exports = grammar({
4073
),
4174

4275
_tokens: $ => choice(
76+
$.comment,
4377
$.string,
4478
$.number,
4579
$.builtin,
4680
$.word,
47-
$.comment
4881
),
4982

5083
string: $ => choice(
5184
/[sScC]" [^"]*"/, // s" S" c" C" followed by space and content
5285
/\." [^"]*"/, // ." followed by space and content
5386
),
5487

55-
number: $ => /('\w')|(0[xX][0-9a-fA-F]+)|(\$[0-9a-fA-F]+)|(%[01]+)|(&\d+)|\d+/,
88+
number: $ => choice(
89+
$.character_literal,
90+
$.hex_number,
91+
$.binary_number,
92+
$.octal_number,
93+
$.float_number,
94+
$.double_cell_number,
95+
$.decimal_number,
96+
),
97+
98+
character_literal: $ => /'\w'/,
99+
100+
hex_number: $ => /-?(0[xX][0-9a-fA-F]+|\$[0-9a-fA-F]+)/,
101+
102+
binary_number: $ => /-?%[01]+/,
103+
104+
octal_number: $ => /-?&\d+/,
105+
106+
float_number: $ => /-?\d+\.\d+([eE][+-]?\d+)?/,
107+
108+
double_cell_number: $ => /-?\d+\./,
109+
110+
decimal_number: $ => /-?\d+/,
56111

57112
builtin: $ => choice(
58-
$.core,
113+
$.control_flow,
114+
$.io,
59115
$.operator,
116+
$.core,
60117
),
61118

62119
start_definition: $ => ":",
@@ -80,9 +137,13 @@ module.exports = grammar({
80137

81138
block_comment: $ => /\( [^)]*\)/,
82139

140+
control_flow: $ => choice(...control_flow.map(x => literal(x))),
141+
142+
io: $ => choice(...io_words.map(x => literal(x))),
143+
83144
core: $ => choice(...builtin_core.map(x => literal(x))),
84145

85-
operator: _ => choice(...builtin_oprs),
146+
operator: $ => choice(...builtin_oprs.map(x => literal(x))),
86147

87148
word: $ => /\S+/,
88149
}

queries/highlights.scm

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,37 @@
1+
; Definition keywords
12
[
23
(start_definition)
34
(end_definition)
45
] @keyword
56

6-
(number) @constant
7+
; Control flow - highlighted as keywords for prominence
8+
(control_flow) @keyword.control
79

8-
(string) @string
9-
10-
(word) @function
10+
; I/O operations
11+
(io) @function.builtin
1112

12-
(comment) @comment
13+
; Operators - arithmetic, logic, stack manipulation
14+
(operator) @operator
1315

16+
; Core builtins - defining words, memory, etc.
1417
(core) @type
1518

16-
(operator) @operator
19+
; Numbers - all subtypes
20+
(character_literal) @constant.character
21+
(hex_number) @constant.numeric
22+
(binary_number) @constant.numeric
23+
(octal_number) @constant.numeric
24+
(float_number) @constant.numeric
25+
(double_cell_number) @constant.numeric
26+
(decimal_number) @constant.numeric
27+
28+
; Strings
29+
(string) @string
30+
31+
; Comments - different types
32+
(line_comment) @comment.line
33+
(block_comment) @comment.block
34+
(stack_effect) @comment.documentation
35+
36+
; User-defined words
37+
(word) @function

0 commit comments

Comments
 (0)