Skip to content

Commit 3d73a88

Browse files
authored
Merge pull request #14 from theodevelop/dev
fix: token parsing robustness — lowercase names and comment stripping
2 parents d12a5d2 + eac11a0 commit 3d73a88

File tree

6 files changed

+183
-40
lines changed

6 files changed

+183
-40
lines changed

CHANGELOG.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,27 @@
22

33
All notable changes to the **Bison/Flex Language Support** extension will be documented in this file.
44

5+
## [1.1.3] - 2026-03-23
6+
7+
### Fixed
8+
9+
- **Bison — comments and action blocks**: Identifiers inside `/* */` block comments
10+
(including multi-line), `//` line comments, and `{ }` action blocks are no longer
11+
falsely reported as undeclared tokens
12+
13+
---
14+
15+
## [1.1.2] - 2026-03-20
16+
17+
### Fixed
18+
19+
- **Bison — lowercase and mixed-case token names**: Tokens with lowercase letters or
20+
digits in their names (e.g. `lower_case_tok`, `STANDARD_202x`, `MIXEDcase123`) are
21+
now correctly parsed from `%token` declarations and no longer trigger false
22+
"unused token" warnings
23+
24+
---
25+
526
## [1.1.1] - 2026-03-19
627

728
### Fixed

bison-flex-icon-theme.json

Lines changed: 0 additions & 21 deletions
This file was deleted.

package.json

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"name": "bison-flex-lang",
33
"displayName": "Bison/Flex Language Support",
44
"description": "Full-featured language support for GNU Bison (.y, .yy) and Flex/RE-flex (.l, .ll) — syntax highlighting with embedded C/C++, real-time diagnostics, intelligent autocompletion, and hover documentation for all directives.",
5-
"version": "1.1.2",
5+
"version": "1.1.3",
66
"publisher": "theodevelop",
77
"license": "MIT",
88
"repository": {
@@ -105,13 +105,6 @@
105105
"path": "./snippets/flex.json"
106106
}
107107
],
108-
"iconThemes": [
109-
{
110-
"id": "bison-flex-icons",
111-
"label": "Bison/Flex File Icons",
112-
"path": "./bison-flex-icon-theme.json"
113-
}
114-
],
115108
"commands": [
116109
{
117110
"command": "bisonFlex.compileBison",

server/src/parser/bisonParser.ts

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,13 @@ const KNOWN_BISON_DIRECTIVES = new Set([
3535
]);
3636

3737
export function parseBisonDocument(text: string): BisonDocument {
38-
const lines = text.split(/\r?\n/);
38+
// Strip /* ... */ block comments (including multi-line) before any line-by-line
39+
// processing. Newlines inside comments are preserved so that all line numbers
40+
// remain accurate for diagnostics. Non-newline characters are replaced with
41+
// spaces so column positions of surrounding tokens are unaffected.
42+
const processedText = text.replace(/\/\*[\s\S]*?\*\//g, m =>
43+
m.replace(/[^\n]/g, ' '));
44+
const lines = processedText.split(/\r?\n/);
3945
const doc: BisonDocument = {
4046
tokens: new Map(),
4147
nonTerminals: new Map(),
@@ -190,13 +196,13 @@ export function parseBisonDocument(text: string): BisonDocument {
190196
// %expect / %expect-rr
191197
// %require, %language, %skeleton, etc. — parsed for awareness but no special handling
192198

193-
// Continuation lines for %token (indented ALL_CAPS names after a %token line)
194-
if (lastTokenDirectiveLine >= 0 && i === lastTokenDirectiveLine + 1 && /^\s+[A-Z_]/.test(line)) {
199+
// Continuation lines for %token (indented names after a %token line)
200+
if (lastTokenDirectiveLine >= 0 && i === lastTokenDirectiveLine + 1 && /^\s+[a-zA-Z_]/.test(line)) {
195201
parseTokenNames(trimmed, lastTokenType, i, doc);
196202
lastTokenDirectiveLine = i; // allow chaining
197203
continue;
198204
}
199-
if (/^\s+[A-Z_]/.test(line) && i > 0 && i <= lastTokenDirectiveLine + 1) {
205+
if (/^\s+[a-zA-Z_]/.test(line) && i > 0 && i <= lastTokenDirectiveLine + 1) {
200206
parseTokenNames(trimmed, lastTokenType, i, doc);
201207
lastTokenDirectiveLine = i;
202208
continue;
@@ -416,7 +422,8 @@ function getFirstSymbol(text: string): string | undefined {
416422

417423
function parseTokenNames(text: string, type: string | undefined, lineNum: number, doc: BisonDocument): void {
418424
// Match patterns like: NAME "alias" VALUE or just NAME
419-
const regex = /([A-Z_][A-Z0-9_]*)\s*(?:("(?:[^"\\]|\\.)*")\s*)?(?:(\d+)\s*)?/g;
425+
// Use [a-zA-Z_][a-zA-Z0-9_]* to support lowercase letters and digits in token names.
426+
const regex = /([a-zA-Z_][a-zA-Z0-9_]*)\s*(?:("(?:[^"\\]|\\.)*")\s*)?(?:(\d+)\s*)?/g;
420427
let match: RegExpExecArray | null;
421428
while ((match = regex.exec(text)) !== null) {
422429
const name = match[1];

syntaxes/bison.tmLanguage.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@
9797

9898
"rule-prec": {
9999
"comment": "%prec TOKEN in rules",
100-
"match": "(%prec)\\s+([A-Z_][A-Z0-9_]*)",
100+
"match": "(%prec)\\s+([a-zA-Z_][a-zA-Z0-9_]*)",
101101
"captures": {
102102
"1": { "name": "keyword.control.prec.bison" },
103103
"2": { "name": "entity.name.constant.token.bison" }
@@ -111,9 +111,9 @@
111111
},
112112

113113
"token-reference": {
114-
"comment": "Token references (ALL_CAPS identifiers) in rules",
114+
"comment": "Token and symbol references in grammar rules (may be ALL_CAPS, mixed-case, or lowercase)",
115115
"name": "entity.name.constant.token.bison",
116-
"match": "\\b[A-Z_][A-Z0-9_]{1,}\\b"
116+
"match": "\\b[a-zA-Z_][a-zA-Z0-9_]+\\b"
117117
},
118118

119119
"semantic-values": {
@@ -156,7 +156,7 @@
156156

157157
"inline-token-directive": {
158158
"comment": "%token inside rules section (e.g., %token CHUNKS \"_chunks\")",
159-
"match": "^\\s*(%token)\\s+([A-Z_][A-Z0-9_]*)(?:\\s+(\"[^\"]*\"))?",
159+
"match": "^\\s*(%token)\\s+([a-zA-Z_][a-zA-Z0-9_]*)(?:\\s+(\"[^\"]*\"))?",
160160
"captures": {
161161
"1": { "name": "storage.type.token.bison" },
162162
"2": { "name": "entity.name.constant.token.bison" },
@@ -402,7 +402,7 @@
402402

403403
"token-continuation-line": {
404404
"comment": "Indented continuation lines for multi-line %token declarations",
405-
"match": "^(\\s+)([A-Z_][A-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?(?:\\s+[A-Z_][A-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?)*)",
405+
"match": "^(\\s+)([a-zA-Z_][a-zA-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?(?:\\s+[a-zA-Z_][a-zA-Z0-9_]*(?:\\s+(?:\"[^\"]*\"|\\d+))?)*)",
406406
"captures": {
407407
"2": { "patterns": [{ "include": "#token-list" }] }
408408
}
@@ -413,7 +413,7 @@
413413
"patterns": [
414414
{
415415
"name": "entity.name.constant.token.bison",
416-
"match": "[A-Z_][A-Z0-9_]*"
416+
"match": "[a-zA-Z_][a-zA-Z0-9_]*"
417417
},
418418
{
419419
"name": "constant.numeric.bison",

tests/test-parsers.ts

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1889,6 +1889,149 @@ console.log('\n\n=== TEST: $n bounds — literals counted as symbols ===\n');
18891889
`$3 out-of-bounds diagnostic has Error severity`);
18901890
}
18911891

1892+
// ════════════════════════════════════════
1893+
// TEST: Lowercase / mixed-case token names (issue #5)
1894+
// ════════════════════════════════════════
1895+
console.log('\n=== TEST: Lowercase and mixed-case token names (issue #5) ===\n');
1896+
1897+
{
1898+
// Each %token declaration with a lowercase/mixed name should produce exactly one token entry.
1899+
const cases: Array<{ line: string; name: string }> = [
1900+
{ line: '%token STANDARD_202x "STANDARD-202x"', name: 'STANDARD_202x' },
1901+
{ line: '%token lower_case_tok "lower"', name: 'lower_case_tok' },
1902+
{ line: '%token MIXEDcase123 "mixed"', name: 'MIXEDcase123' },
1903+
{ line: '%token A_1_B_2_C "alias"', name: 'A_1_B_2_C' },
1904+
];
1905+
1906+
for (const { line, name } of cases) {
1907+
const src = [line, '%%', 'start : ;', '%%'].join('\n');
1908+
const doc = parseBisonDocument(src);
1909+
assert(doc.tokens.has(name),
1910+
`parseTokenNames: '${line}' → token '${name}' is registered`);
1911+
assert(doc.tokens.size === 1,
1912+
`parseTokenNames: '${line}' → exactly 1 token (got ${doc.tokens.size}: ${[...doc.tokens.keys()].join(', ')})`);
1913+
}
1914+
1915+
// When those tokens are USED in rules → 0 "unused token" warnings.
1916+
const usedSrc = [
1917+
'%token STANDARD_202x "STANDARD-202x"',
1918+
'%token lower_case_tok "lower"',
1919+
'%token MIXEDcase123 "mixed"',
1920+
'%token A_1_B_2_C "alias"',
1921+
'%%',
1922+
'start : STANDARD_202x lower_case_tok MIXEDcase123 A_1_B_2_C ;',
1923+
'%%',
1924+
].join('\n');
1925+
const usedDoc = parseBisonDocument(usedSrc);
1926+
const usedDiags = computeBisonDiagnostics(usedDoc, usedSrc);
1927+
const unusedWarnings = usedDiags.filter(d => d.message.includes('declared but never used'));
1928+
assert(unusedWarnings.length === 0,
1929+
`All four mixed-case tokens used in rules → 0 "unused" warnings (got ${unusedWarnings.length}: ${unusedWarnings.map(d => d.message).join('; ')})`);
1930+
1931+
// When those tokens are NOT used → exactly 1 warning each, with the full token name.
1932+
const unusedSrc = [
1933+
'%token STANDARD_202x "STANDARD-202x"',
1934+
'%token lower_case_tok "lower"',
1935+
'%token MIXEDcase123 "mixed"',
1936+
'%token A_1_B_2_C "alias"',
1937+
'%%',
1938+
'start : ;',
1939+
'%%',
1940+
].join('\n');
1941+
const unusedDoc = parseBisonDocument(unusedSrc);
1942+
const unusedDiags = computeBisonDiagnostics(unusedDoc, unusedSrc);
1943+
const allUnused = unusedDiags.filter(d => d.message.includes('declared with %token but never used'));
1944+
assert(allUnused.length === 4,
1945+
`Four unused mixed-case tokens → exactly 4 warnings (got ${allUnused.length}: ${allUnused.map(d => d.message).join('; ')})`);
1946+
for (const name of ['STANDARD_202x', 'lower_case_tok', 'MIXEDcase123', 'A_1_B_2_C']) {
1947+
const w = allUnused.find(d => d.message.includes(`'${name}'`));
1948+
assert(w !== undefined,
1949+
`Unused token warning for '${name}' uses the full name (not a fragment)`);
1950+
}
1951+
}
1952+
1953+
// ════════════════════════════════════════
1954+
// TEST: Comments in rules ignored (issue #8)
1955+
// ════════════════════════════════════════
1956+
console.log('\n=== TEST: Comments in rules ignored (issue #8) ===\n');
1957+
1958+
{
1959+
// Test 1: Inline /* */ block comment — token inside must NOT be flagged
1960+
const src1 = [
1961+
'%token TOKEN_A TOKEN_B',
1962+
'%%',
1963+
'start : rule ;',
1964+
'rule : TOKEN_A /* FAKE_COMMENT_TOKEN */ TOKEN_B { $$ = $1; }',
1965+
'%%',
1966+
].join('\n');
1967+
const doc1 = parseBisonDocument(src1);
1968+
const diags1 = computeBisonDiagnostics(doc1, src1);
1969+
const fake1 = diags1.filter(d => d.message.includes('FAKE_COMMENT_TOKEN'));
1970+
assert(fake1.length === 0,
1971+
`FAKE_COMMENT_TOKEN inside /* */ must NOT be flagged (got ${fake1.length} diag(s): ${fake1.map(d => d.message).join('; ')})`);
1972+
1973+
// Test 2: // line comment — token inside must NOT be flagged
1974+
const src2 = [
1975+
'%token TOKEN_C',
1976+
'%%',
1977+
'start : rule ;',
1978+
'rule : TOKEN_C // ANOTHER_FAKE_TOKEN',
1979+
' { }',
1980+
'%%',
1981+
].join('\n');
1982+
const doc2 = parseBisonDocument(src2);
1983+
const diags2 = computeBisonDiagnostics(doc2, src2);
1984+
const fake2 = diags2.filter(d => d.message.includes('ANOTHER_FAKE_TOKEN'));
1985+
assert(fake2.length === 0,
1986+
`ANOTHER_FAKE_TOKEN after // must NOT be flagged (got ${fake2.length} diag(s))`);
1987+
1988+
// Test 3: Action block { } — identifier inside must NOT be flagged
1989+
const src3 = [
1990+
'%token TOKEN_D',
1991+
'%%',
1992+
'start : rule ;',
1993+
'rule : TOKEN_D { int x = LOOKS_LIKE_TOKEN; }',
1994+
'%%',
1995+
].join('\n');
1996+
const doc3 = parseBisonDocument(src3);
1997+
const diags3 = computeBisonDiagnostics(doc3, src3);
1998+
const fake3 = diags3.filter(d => d.message.includes('LOOKS_LIKE_TOKEN'));
1999+
assert(fake3.length === 0,
2000+
`LOOKS_LIKE_TOKEN inside action block { } must NOT be flagged (got ${fake3.length} diag(s))`);
2001+
2002+
// Test 4: Multi-line /* */ block comment — tokens inside must NOT be flagged
2003+
const src4 = [
2004+
'%token TOKEN_E TOKEN_F',
2005+
'%%',
2006+
'start : rule ;',
2007+
'rule : TOKEN_E /* FAKE_1',
2008+
' FAKE_2 */ TOKEN_F',
2009+
'%%',
2010+
].join('\n');
2011+
const doc4 = parseBisonDocument(src4);
2012+
const diags4 = computeBisonDiagnostics(doc4, src4);
2013+
const fake4a = diags4.filter(d => d.message.includes('FAKE_1'));
2014+
const fake4b = diags4.filter(d => d.message.includes('FAKE_2'));
2015+
assert(fake4a.length === 0,
2016+
`FAKE_1 inside multi-line /* */ must NOT be flagged (got ${fake4a.length} diag(s))`);
2017+
assert(fake4b.length === 0,
2018+
`FAKE_2 inside multi-line /* */ must NOT be flagged (got ${fake4b.length} diag(s))`);
2019+
2020+
// Test 5: Real undeclared token (not in a comment or action) MUST be flagged
2021+
const src5 = [
2022+
'%token TOKEN_G',
2023+
'%%',
2024+
'start : rule ;',
2025+
'rule : TOKEN_G UNDECLARED_TOKEN',
2026+
'%%',
2027+
].join('\n');
2028+
const doc5 = parseBisonDocument(src5);
2029+
const diags5 = computeBisonDiagnostics(doc5, src5);
2030+
const undeclared5 = diags5.filter(d => d.message.includes('UNDECLARED_TOKEN'));
2031+
assert(undeclared5.length >= 1,
2032+
`UNDECLARED_TOKEN (not in /* */ or {}) MUST be flagged (got ${undeclared5.length} diag(s))`);
2033+
}
2034+
18922035
// ════════════════════════════════════════
18932036
// SUMMARY
18942037
// ════════════════════════════════════════

0 commit comments

Comments
 (0)