Skip to content

Commit 3d73a88

Browse files
authored
Merge pull request #14 from theodevelop/dev
fix: token parsing robustness — lowercase names and comment stripping
2 parents d12a5d2 + eac11a0 commit 3d73a88

File tree

6 files changed

+183
-40
lines changed

6 files changed

+183
-40
lines changed

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,27 @@
22

33
All notable changes to the **Bison/Flex Language Support** extension will be documented in this file.
44

5+
## [1.1.3] - 2026-03-23
6+
7+
### Fixed
8+
9+
- **Bison — comments and action blocks**: Identifiers inside `/* */` block comments
10+
(including multi-line), `//` line comments, and `{ }` action blocks are no longer
11+
falsely reported as undeclared tokens
12+
13+
---
14+
15+
## [1.1.2] - 2026-03-20
16+
17+
### Fixed
18+
19+
- **Bison — lowercase and mixed-case token names**: Tokens with lowercase letters or
20+
digits in their names (e.g. `lower_case_tok`, `STANDARD_202x`, `MIXEDcase123`) are
21+
now correctly parsed from `%token` declarations and no longer trigger false
22+
"unused token" warnings
23+
24+
---
25+
526
## [1.1.1] - 2026-03-19
627

728
### Fixed

bison-flex-icon-theme.json

Lines changed: 0 additions & 21 deletions
This file was deleted.

package.json

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"name": "bison-flex-lang",
33
"displayName": "Bison/Flex Language Support",
44
"description": "Full-featured language support for GNU Bison (.y, .yy) and Flex/RE-flex (.l, .ll) — syntax highlighting with embedded C/C++, real-time diagnostics, intelligent autocompletion, and hover documentation for all directives.",
5-
"version": "1.1.2",
5+
"version": "1.1.3",
66
"publisher": "theodevelop",
77
"license": "MIT",
88
"repository": {
@@ -105,13 +105,6 @@
105105
"path": "./snippets/flex.json"
106106
}
107107
],
108-
"iconThemes": [
109-
{
110-
"id": "bison-flex-icons",
111-
"label": "Bison/Flex File Icons",
112-
"path": "./bison-flex-icon-theme.json"
113-
}
114-
],
115108
"commands": [
116109
{
117110
"command": "bisonFlex.compileBison",

server/src/parser/bisonParser.ts

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,13 @@ const KNOWN_BISON_DIRECTIVES = new Set([
3535
]);
3636

3737
export function parseBisonDocument(text: string): BisonDocument {
38-
const lines = text.split(/\r?\n/);
38+
// Strip /* ... */ block comments (including multi-line) before any line-by-line
39+
// processing. Newlines inside comments are preserved so that all line numbers
40+
// remain accurate for diagnostics. Non-newline characters are replaced with
41+
// spaces so column positions of surrounding tokens are unaffected.
42+
const processedText = text.replace(/\/\*[\s\S]*?\*\//g, m =>
43+
m.replace(/[^\n]/g, ' '));
44+
const lines = processedText.split(/\r?\n/);
3945
const doc: BisonDocument = {
4046
tokens: new Map(),
4147
nonTerminals: new Map(),
@@ -190,13 +196,13 @@ export function parseBisonDocument(text: string): BisonDocument {
190196
// %expect / %expect-rr
191197
// %require, %language, %skeleton, etc. — parsed for awareness but no special handling
192198

193-
// Continuation lines for %token (indented ALL_CAPS names after a %token line)
194-
if (lastTokenDirectiveLine >= 0 && i === lastTokenDirectiveLine + 1 && /^\s+[A-Z_]/.test(line)) {
199+
// Continuation lines for %token (indented names after a %token line)
200+
if (lastTokenDirectiveLine >= 0 && i === lastTokenDirectiveLine + 1 && /^\s+[a-zA-Z_]/.test(line)) {
195201
parseTokenNames(trimmed, lastTokenType, i, doc);
196202
lastTokenDirectiveLine = i; // allow chaining
197203
continue;
198204
}
199-
if (/^\s+[A-Z_]/.test(line) && i > 0 && i <= lastTokenDirectiveLine + 1) {
205+
if (/^\s+[a-zA-Z_]/.test(line) && i > 0 && i <= lastTokenDirectiveLine + 1) {
200206
parseTokenNames(trimmed, lastTokenType, i, doc);
201207
lastTokenDirectiveLine = i;
202208
continue;
@@ -416,7 +422,8 @@ function getFirstSymbol(text: string): string | undefined {
416422

417423
function parseTokenNames(text: string, type: string | undefined, lineNum: number, doc: BisonDocument): void {
418424
// Match patterns like: NAME "alias" VALUE or just NAME
419-
const regex = /([A-Z_][A-Z0-9_]*)\s*(?:("(?:[^"\\]|\\.)*")\s*)?(?:(\d+)\s*)?/g;
425+
// Use [a-zA-Z_][a-zA-Z0-9_]* to support lowercase letters and digits in token names.
426+
const regex = /([a-zA-Z_][a-zA-Z0-9_]*)\s*(?:("(?:[^"\\]|\\.)*")\s*)?(?:(\d+)\s*)?/g;
420427
let match: RegExpExecArray | null;
421428
while ((match = regex.exec(text)) !== null) {
422429
const name = match[1];

syntaxes/bison.tmLanguage.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@
9797

9898
"rule-prec": {
9999
"comment": "%prec TOKEN in rules",
100-
"match": "(%prec)\\s+([A-Z_][A-Z0-9_]*)",
100+
"match": "(%prec)\\s+([a-zA-Z_][a-zA-Z0-9_]*)",
101101
"captures": {
102102
"1": { "name": "keyword.control.prec.bison" },
103103
"2": { "name": "entity.name.constant.token.bison" }
@@ -111,9 +111,9 @@
111111
},
112112

113113
"token-reference": {
114-
"comment": "Token references (ALL_CAPS identifiers) in rules",
114+
"comment": "Token and symbol references in grammar rules (may be ALL_CAPS, mixed-case, or lowercase)",
115115
"name": "entity.name.constant.token.bison",
116-
"match": "\\b[A-Z_][A-Z0-9_]{1,}\\b"
116+
"match": "\\b[a-zA-Z_][a-zA-Z0-9_]+\\b"
117117
},
118118

119119
"semantic-values": {
@@ -156,7 +156,7 @@
156156

157157
"inline-token-directive": {
158158
"comment": "%token inside rules section (e.g., %token CHUNKS \"_chunks\")",
159-
"match": "^\\s*(%token)\\s+([A-Z_][A-Z0-9_]*)(?:\\s+(\"[^\"]*\"))?",
159+
"match": "^\\s*(%token)\\s+([a-zA-Z_][a-zA-Z0-9_]*)(?:\\s+(\"[^\"]*\"))?",
160160
"captures": {
161161
"1": { "name": "storage.type.token.bison" },
162162
"2": { "name": "entity.name.constant.token.bison" },
@@ -402,7 +402,7 @@
402402

403403
"token-continuation-line": {
404404
"comment": "Indented continuation lines for multi-line %token declarations",
405-
"match": "^(\\s+)([A-Z_][A-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?(?:\\s+[A-Z_][A-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?)*)",
405+
"match": "^(\\s+)([a-zA-Z_][a-zA-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?(?:\\s+[a-zA-Z_][a-zA-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?)*)",
406406
"captures": {
407407
"2": { "patterns": [{ "include": "#token-list" }] }
408408
}
@@ -413,7 +413,7 @@
413413
"patterns": [
414414
{
415415
"name": "entity.name.constant.token.bison",
416-
"match": "[A-Z_][A-Z0-9_]*"
416+
"match": "[a-zA-Z_][a-zA-Z0-9_]*"
417417
},
418418
{
419419
"name": "constant.numeric.bison",

tests/test-parsers.ts

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1889,6 +1889,149 @@ console.log('\n\n=== TEST: $n bounds — literals counted as symbols ===\n');
18891889
`$3 out-of-bounds diagnostic has Error severity`);
18901890
}
18911891

1892+
// ════════════════════════════════════════
1893+
// TEST: Lowercase / mixed-case token names (issue #5)
1894+
// ════════════════════════════════════════
1895+
console.log('\n=== TEST: Lowercase and mixed-case token names (issue #5) ===\n');
1896+
1897+
{
1898+
// Each %token declaration with a lowercase/mixed name should produce exactly one token entry.
1899+
const cases: Array<{ line: string; name: string }> = [
1900+
{ line: '%token STANDARD_202x "STANDARD-202x"', name: 'STANDARD_202x' },
1901+
{ line: '%token lower_case_tok "lower"', name: 'lower_case_tok' },
1902+
{ line: '%token MIXEDcase123 "mixed"', name: 'MIXEDcase123' },
1903+
{ line: '%token A_1_B_2_C "alias"', name: 'A_1_B_2_C' },
1904+
];
1905+
1906+
for (const { line, name } of cases) {
1907+
const src = [line, '%%', 'start : ;', '%%'].join('\n');
1908+
const doc = parseBisonDocument(src);
1909+
assert(doc.tokens.has(name),
1910+
`parseTokenNames: '${line}' → token '${name}' is registered`);
1911+
assert(doc.tokens.size === 1,
1912+
`parseTokenNames: '${line}' → exactly 1 token (got ${doc.tokens.size}: ${[...doc.tokens.keys()].join(', ')})`);
1913+
}
1914+
1915+
// When those tokens are USED in rules → 0 "unused token" warnings.
1916+
const usedSrc = [
1917+
'%token STANDARD_202x "STANDARD-202x"',
1918+
'%token lower_case_tok "lower"',
1919+
'%token MIXEDcase123 "mixed"',
1920+
'%token A_1_B_2_C "alias"',
1921+
'%%',
1922+
'start : STANDARD_202x lower_case_tok MIXEDcase123 A_1_B_2_C ;',
1923+
'%%',
1924+
].join('\n');
1925+
const usedDoc = parseBisonDocument(usedSrc);
1926+
const usedDiags = computeBisonDiagnostics(usedDoc, usedSrc);
1927+
const unusedWarnings = usedDiags.filter(d => d.message.includes('declared but never used'));
1928+
assert(unusedWarnings.length === 0,
1929+
`All four mixed-case tokens used in rules → 0 "unused" warnings (got ${unusedWarnings.length}: ${unusedWarnings.map(d => d.message).join('; ')})`);
1930+
1931+
// When those tokens are NOT used → exactly 1 warning each, with the full token name.
1932+
const unusedSrc = [
1933+
'%token STANDARD_202x "STANDARD-202x"',
1934+
'%token lower_case_tok "lower"',
1935+
'%token MIXEDcase123 "mixed"',
1936+
'%token A_1_B_2_C "alias"',
1937+
'%%',
1938+
'start : ;',
1939+
'%%',
1940+
].join('\n');
1941+
const unusedDoc = parseBisonDocument(unusedSrc);
1942+
const unusedDiags = computeBisonDiagnostics(unusedDoc, unusedSrc);
1943+
const allUnused = unusedDiags.filter(d => d.message.includes('declared with %token but never used'));
1944+
assert(allUnused.length === 4,
1945+
`Four unused mixed-case tokens → exactly 4 warnings (got ${allUnused.length}: ${allUnused.map(d => d.message).join('; ')})`);
1946+
for (const name of ['STANDARD_202x', 'lower_case_tok', 'MIXEDcase123', 'A_1_B_2_C']) {
1947+
const w = allUnused.find(d => d.message.includes(`'${name}'`));
1948+
assert(w !== undefined,
1949+
`Unused token warning for '${name}' uses the full name (not a fragment)`);
1950+
}
1951+
}
1952+
1953+
// ════════════════════════════════════════
1954+
// TEST: Comments in rules ignored (issue #8)
1955+
// ════════════════════════════════════════
1956+
console.log('\n=== TEST: Comments in rules ignored (issue #8) ===\n');
1957+
1958+
{
1959+
// Test 1: Inline /* */ block comment — token inside must NOT be flagged
1960+
const src1 = [
1961+
'%token TOKEN_A TOKEN_B',
1962+
'%%',
1963+
'start : rule ;',
1964+
'rule : TOKEN_A /* FAKE_COMMENT_TOKEN */ TOKEN_B { $$ = $1; }',
1965+
'%%',
1966+
].join('\n');
1967+
const doc1 = parseBisonDocument(src1);
1968+
const diags1 = computeBisonDiagnostics(doc1, src1);
1969+
const fake1 = diags1.filter(d => d.message.includes('FAKE_COMMENT_TOKEN'));
1970+
assert(fake1.length === 0,
1971+
`FAKE_COMMENT_TOKEN inside /* */ must NOT be flagged (got ${fake1.length} diag(s): ${fake1.map(d => d.message).join('; ')})`);
1972+
1973+
// Test 2: // line comment — token inside must NOT be flagged
1974+
const src2 = [
1975+
'%token TOKEN_C',
1976+
'%%',
1977+
'start : rule ;',
1978+
'rule : TOKEN_C // ANOTHER_FAKE_TOKEN',
1979+
' { }',
1980+
'%%',
1981+
].join('\n');
1982+
const doc2 = parseBisonDocument(src2);
1983+
const diags2 = computeBisonDiagnostics(doc2, src2);
1984+
const fake2 = diags2.filter(d => d.message.includes('ANOTHER_FAKE_TOKEN'));
1985+
assert(fake2.length === 0,
1986+
`ANOTHER_FAKE_TOKEN after // must NOT be flagged (got ${fake2.length} diag(s))`);
1987+
1988+
// Test 3: Action block { } — identifier inside must NOT be flagged
1989+
const src3 = [
1990+
'%token TOKEN_D',
1991+
'%%',
1992+
'start : rule ;',
1993+
'rule : TOKEN_D { int x = LOOKS_LIKE_TOKEN; }',
1994+
'%%',
1995+
].join('\n');
1996+
const doc3 = parseBisonDocument(src3);
1997+
const diags3 = computeBisonDiagnostics(doc3, src3);
1998+
const fake3 = diags3.filter(d => d.message.includes('LOOKS_LIKE_TOKEN'));
1999+
assert(fake3.length === 0,
2000+
`LOOKS_LIKE_TOKEN inside action block { } must NOT be flagged (got ${fake3.length} diag(s))`);
2001+
2002+
// Test 4: Multi-line /* */ block comment — tokens inside must NOT be flagged
2003+
const src4 = [
2004+
'%token TOKEN_E TOKEN_F',
2005+
'%%',
2006+
'start : rule ;',
2007+
'rule : TOKEN_E /* FAKE_1',
2008+
' FAKE_2 */ TOKEN_F',
2009+
'%%',
2010+
].join('\n');
2011+
const doc4 = parseBisonDocument(src4);
2012+
const diags4 = computeBisonDiagnostics(doc4, src4);
2013+
const fake4a = diags4.filter(d => d.message.includes('FAKE_1'));
2014+
const fake4b = diags4.filter(d => d.message.includes('FAKE_2'));
2015+
assert(fake4a.length === 0,
2016+
`FAKE_1 inside multi-line /* */ must NOT be flagged (got ${fake4a.length} diag(s))`);
2017+
assert(fake4b.length === 0,
2018+
`FAKE_2 inside multi-line /* */ must NOT be flagged (got ${fake4b.length} diag(s))`);
2019+
2020+
// Test 5: Real undeclared token (not in a comment or action) MUST be flagged
2021+
const src5 = [
2022+
'%token TOKEN_G',
2023+
'%%',
2024+
'start : rule ;',
2025+
'rule : TOKEN_G UNDECLARED_TOKEN',
2026+
'%%',
2027+
].join('\n');
2028+
const doc5 = parseBisonDocument(src5);
2029+
const diags5 = computeBisonDiagnostics(doc5, src5);
2030+
const undeclared5 = diags5.filter(d => d.message.includes('UNDECLARED_TOKEN'));
2031+
assert(undeclared5.length >= 1,
2032+
`UNDECLARED_TOKEN (not in /* */ or {}) MUST be flagged (got ${undeclared5.length} diag(s))`);
2033+
}
2034+
18922035
// ════════════════════════════════════════
18932036
// SUMMARY
18942037
// ════════════════════════════════════════

0 commit comments

Comments
 (0)