Merge pull request #26 from theodevelop/dev

theodevelop · web-flow · commit e0c6aa47f729 · 2026-03-31T23:38:58.000+02:00
fix: resolve issues #21 #22 #23 — mid-rule actions, %token alias, flex SC blocks
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,16 @@
 
 All notable changes to the **Bison/Flex Language Support** extension will be documented in this file.
 
+## [1.4.1] - 2026-03-31
+
+### Fixed
+
+- **Bison — mid-rule action `$N` out-of-bounds** (#21): Action blocks `{ }` embedded in the middle of a production are now counted as grammar symbols in Bison's `$N` numbering. Previously they were silently stripped, causing false-positive `bison/out-of-bounds` errors and missed real out-of-bounds accesses.
+- **Bison — `%token` numeric value and string alias** (#22): `%token NAME NUMBER "alias"` is now parsed in the correct order (numeric value before string alias). Previously, words inside the alias string were misidentified as token names, generating spurious `bison/undeclared-token` and `bison/unused-token` diagnostics.
+- **Flex — `<SC>{ }` block syntax** (#23): Rules grouped inside a `<SC1,SC2>{ ... }` block now correctly inherit their start conditions. Previously, the block header was misidentified as a rule pattern, suppressing all rules inside it and generating false `flex/unreachable-rule` and `flex/unused-sc` diagnostics.
+
+---
+
 ## [1.4.0] - 2026-03-30
 
 ### Added
diff --git a/package.json b/package.json
@@ -2,7 +2,7 @@
   "name": "bison-flex-lang",
   "displayName": "Bison/Flex Language Support",
   "description": "Full-featured language support for GNU Bison (.y, .yy) and Flex/RE-flex (.l, .ll) — syntax highlighting with embedded C/C++, real-time diagnostics, intelligent autocompletion, and hover documentation for all directives.",
-  "version": "1.4.0",
+  "version": "1.4.1",
   "publisher": "theodevelop",
   "license": "MIT",
   "repository": {
diff --git a/server/src/parser/bisonParser.ts b/server/src/parser/bisonParser.ts
@@ -335,11 +335,23 @@ export function parseBisonDocument(text: string): BisonDocument {
       }
     }
 
-    // Track braces
+    // Track braces — and detect when a multi-line action block opens.
+    // When braceDepth goes from 0 to >0, a block opened on this line is still unclosed.
+    // Bison counts a mid-rule action as a rule component ($N position), so we
+    // add a '__midaction__' sentinel to the current alternative's symbol list.
+    // NOTE: a rule's final action is not a Bison component, yet it is counted here too.
+    // Balanced same-line blocks (e.g. `{ $$ = $1; }`) are counted by extractSymbols instead.
+    const prevDepth = braceDepth;
     for (const ch of line) {
       if (ch === '{') braceDepth++;
       if (ch === '}') braceDepth = Math.max(0, braceDepth - 1);
     }
+    if (prevDepth === 0 && braceDepth > 0 && currentRule) {
+      const curRule = doc.rules.get(currentRule);
+      if (curRule && curRule.alternatives.length > 0) {
+        curRule.alternatives[curRule.alternatives.length - 1].symbols.push('__midaction__');
+      }
+    }
   }
 
   return doc;
@@ -388,7 +400,7 @@ function replaceStringLiterals(text: string): string {
  */
 function extractSymbols(text: string): string[] {
   const cleaned = replaceStringLiterals(text)
-    .replace(/\{[^}]*\}/g, ' ')                   // remove inline actions
+    .replace(/\{[^}]*\}/g, ' __midaction__ ')     // inline actions count as a symbol ($N position)
     .replace(/%prec\s+\S+/g, ' ')                 // remove %prec TOKEN
     .replace(/%empty/g, ' ')                      // remove %empty
     .replace(/\/\/.*$/g, ' ')                     // remove line comments
@@ -422,23 +434,85 @@ function getFirstSymbol(text: string): string | undefined {
 }
 
 function parseTokenNames(text: string, type: string | undefined, lineNum: number, doc: BisonDocument, colOffset: number = 0): void {
-  // Match patterns like: NAME "alias" VALUE  or just NAME
-  // Use [a-zA-Z_][a-zA-Z0-9_]* to support lowercase letters and digits in token names.
-  const regex = /([a-zA-Z_][a-zA-Z0-9_]*)\s*(?:("(?:[^"\\]|\\.)*")\s*)?(?:(\d+)\s*)?/g;
-  let match: RegExpExecArray | null;
-  while ((match = regex.exec(text)) !== null) {
-    const name = match[1];
-    const alias = match[2]?.replace(/"/g, '');
-    const value = match[3] ? parseInt(match[3]) : undefined;
-    const col = colOffset + match.index;
-    const decl: TokenDeclaration = {
-      name,
-      type,
-      alias,
-      location: Range.create(lineNum, col, lineNum, col + name.length),
-      value,
-    };
-    doc.tokens.set(name, decl);
+  // Bison token declaration syntax: NAME [NUMBER] ["alias"]  (repeating)
+  // The optional NUMBER comes BEFORE the optional "alias".
+  // We use a character scanner to correctly skip string literals and numeric values
+  // so that words inside "end of file" are not mistaken for token names.
+  let pos = 0;
+
+  while (pos < text.length) {
+    // Skip whitespace
+    while (pos < text.length && (text[pos] === ' ' || text[pos] === '\t')) pos++;
+    if (pos >= text.length) break;
+
+    const ch = text[pos];
+
+    // Skip stray string literals (an alias directly after a NAME is consumed by the peek-ahead below)
+    if (ch === '"') {
+      pos++;
+      while (pos < text.length && text[pos] !== '"') {
+        if (text[pos] === '\\') pos++;  // skip escaped character
+        pos++;
+      }
+      pos++;  // skip closing quote
+      continue;
+    }
+
+    // Skip numeric token values
+    if (ch >= '0' && ch <= '9') {
+      while (pos < text.length && text[pos] >= '0' && text[pos] <= '9') pos++;
+      continue;
+    }
+
+    // Match identifier (token name)
+    if (/[a-zA-Z_]/.test(ch)) {
+      const nameStart = pos;
+      while (pos < text.length && /[a-zA-Z0-9_]/.test(text[pos])) pos++;
+      const name = text.substring(nameStart, pos);
+      const col = colOffset + nameStart;
+
+      // Peek ahead: optional NUMBER then optional "alias"
+      let peekPos = pos;
+      let alias: string | undefined;
+      let value: number | undefined;
+
+      // Skip whitespace
+      while (peekPos < text.length && (text[peekPos] === ' ' || text[peekPos] === '\t')) peekPos++;
+
+      // Optional numeric token code (e.g. %token TOKEN_EOF 0 "end of file")
+      if (peekPos < text.length && text[peekPos] >= '0' && text[peekPos] <= '9') {
+        const numStart = peekPos;
+        while (peekPos < text.length && text[peekPos] >= '0' && text[peekPos] <= '9') peekPos++;
+        value = parseInt(text.substring(numStart, peekPos), 10);
+        pos = peekPos;
+        while (peekPos < text.length && (text[peekPos] === ' ' || text[peekPos] === '\t')) peekPos++;
+      }
+
+      // Optional string alias (e.g. %token PLUS "+" or %token TOKEN_EOF 0 "end of file")
+      if (peekPos < text.length && text[peekPos] === '"') {
+        peekPos++;  // skip opening quote
+        const aliasStart = peekPos;
+        while (peekPos < text.length && text[peekPos] !== '"') {
+          if (text[peekPos] === '\\') peekPos++;  // skip escaped character
+          peekPos++;
+        }
+        alias = text.substring(aliasStart, peekPos);
+        peekPos++;  // skip closing quote
+        pos = peekPos;
+      }
+
+      doc.tokens.set(name, {
+        name,
+        type,
+        alias,
+        location: Range.create(lineNum, col, lineNum, col + name.length),
+        value,
+      });
+      continue;
+    }
+
+    // Skip any other character (e.g. punctuation, stray closing >)
+    pos++;
   }
 }
 
diff --git a/server/src/parser/flexParser.ts b/server/src/parser/flexParser.ts
@@ -198,7 +198,17 @@ export function parseFlexDocument(text: string): FlexDocument {
   }
 
   // Phase 3: Parse rules section
-  let braceDepth = 0;
+  //
+  // Two separate depth counters are needed:
+  //   actionDepth  — depth of C action blocks ({ ... }); content is skipped
+  //   scBlockStack — stack of start-condition lists for <SC>{...} blocks;
+  //                  rules INSIDE these blocks inherit the SC names.
+  //
+  // A <SC>{...} block is NOT an action block: its content is Flex rules.
+  // Action blocks nested inside a <SC> block increment actionDepth as usual.
+  let actionDepth = 0;
+  const scBlockStack: string[][] = [];   // each entry = SC list for one nesting level
+  let pendingScHeader: string | null = null; // accumulates multi-line <SC1,\nSC2>{ headers
   inBlockComment = false;
 
   for (let i = rulesStart; i < rulesEnd; i++) {
@@ -218,16 +228,70 @@ export function parseFlexDocument(text: string): FlexDocument {
     // Skip empty lines and line comments
     if (!trimmed || trimmed.startsWith('//')) continue;
 
-    // Skip action blocks (brace-delimited C code)
-    if (braceDepth > 0) {
+    // ── Handle multi-line <SC1,\nSC2>{ header continuation ────────────────────
+    if (pendingScHeader !== null) {
+      const closeIdx = trimmed.indexOf('>');
+      if (closeIdx >= 0) {
+        // Collect any additional SC names before the >
+        const before = trimmed.substring(0, closeIdx);
+        const moreConds = before.match(/[A-Z_][A-Z0-9_]*/g);
+        if (moreConds) pendingScHeader += ',' + moreConds.join(',');
+        const conds = pendingScHeader.replace(/^,+/, '').split(',').filter(s => s.length > 0);
+        pendingScHeader = null;
+        // Expect '{' right after '>' to open the SC block
+        const after = trimmed.substring(closeIdx + 1).trim();
+        if (after === '{') {
+          scBlockStack.push(conds);
+          // actionDepth stays 0; the { is the SC block opening, not an action block
+        }
+      } else {
+        // Still accumulating conditions from this line
+        const moreConds = trimmed.match(/[A-Z_][A-Z0-9_]*/g);
+        if (moreConds) pendingScHeader += ',' + moreConds.join(',');
+      }
+      continue;
+    }
+
+    // ── Skip C action blocks ───────────────────────────────────────────────────
+    if (actionDepth > 0) {
       for (const ch of line) {
-        if (ch === '{') braceDepth++;
-        if (ch === '}') braceDepth = Math.max(0, braceDepth - 1);
+        if (ch === '{') actionDepth++;
+        if (ch === '}') actionDepth = Math.max(0, actionDepth - 1);
       }
       continue;
     }
 
-    // Extract start condition references: <SC_NAME> or <SC1,SC2>
+    // ── SC block closing } (at SC block level, actionDepth === 0) ─────────────
+    if (scBlockStack.length > 0 && trimmed === '}') {
+      scBlockStack.pop();
+      continue;
+    }
+
+    // ── SC block opener: <SC1,SC2>{ ───────────────────────────────────────────
+    // Single-line header: <SC1,SC2>{ or <SC1,SC2> {
+    {
+      const scBlockMatch = trimmed.match(/^<([A-Z_][A-Z0-9_]*(?:,[A-Z_][A-Z0-9_]*)*)>\s*\{/);
+      if (scBlockMatch) {
+        const conds = scBlockMatch[1].split(',');
+        scBlockStack.push(conds);
+        // Record the start condition references from the block header line
+        for (const cond of conds) {
+          const col = line.indexOf(cond);
+          const range = Range.create(i, col >= 0 ? col : 0, i, (col >= 0 ? col : 0) + cond.length);
+          if (!doc.startConditionRefs.has(cond)) doc.startConditionRefs.set(cond, []);
+          doc.startConditionRefs.get(cond)!.push(range);
+        }
+        continue;
+      }
+      // Multi-line header start: <SC1,   (no closing > on this line)
+      const scMultiStart = trimmed.match(/^<([A-Z_][A-Z0-9_]*(?:,[A-Z_][A-Z0-9_]*)*,\s*)$/);
+      if (scMultiStart) {
+        pendingScHeader = scMultiStart[1].replace(/,\s*$/, '');
+        continue;
+      }
+    }
+
+    // ── Extract start condition references: <SC_NAME> or <SC1,SC2> ────────────
     // Exclude <<EOF>> which is a special pattern, not a start condition
     const scRefs = line.matchAll(/(?<!<)<([A-Z_][A-Z0-9_]*(?:,[A-Z_][A-Z0-9_]*)*)>(?!>)/g);
     for (const m of scRefs) {
@@ -242,7 +306,7 @@ export function parseFlexDocument(text: string): FlexDocument {
       }
     }
 
-    // Extract abbreviation references: {name} (but not C code {})
+    // ── Extract abbreviation references: {name} (but not C code {}) ───────────
     // Only match {name} where name is a valid identifier
     const abbrRefs = line.matchAll(/\{([a-zA-Z_][a-zA-Z0-9_]*)\}/g);
     for (const m of abbrRefs) {
@@ -259,11 +323,15 @@ export function parseFlexDocument(text: string): FlexDocument {
       }
     }
 
-    // Build rule entry
-    const startConditions: string[] = [];
+    // ── Build rule entry ───────────────────────────────────────────────────────
+    // Start conditions: explicit <SC> prefix on this line PLUS any inherited from <SC>{ block
+    const inherited = scBlockStack.length > 0 ? scBlockStack[scBlockStack.length - 1] : [];
+    const startConditions: string[] = [...inherited];
     const scMatch = trimmed.match(/^<([A-Z_][A-Z0-9_]*(?:,[A-Z_][A-Z0-9_]*)*)>/);
     if (scMatch) {
-      startConditions.push(...scMatch[1].split(','));
+      for (const c of scMatch[1].split(',')) {
+        if (!startConditions.includes(c)) startConditions.push(c);
+      }
     }
 
     doc.rules.push({
@@ -272,10 +340,10 @@ export function parseFlexDocument(text: string): FlexDocument {
       location: Range.create(i, 0, i, line.length),
     });
 
-    // Track braces for action blocks
+    // ── Track action brace depth ───────────────────────────────────────────────
     for (const ch of line) {
-      if (ch === '{') braceDepth++;
-      if (ch === '}') braceDepth = Math.max(0, braceDepth - 1);
+      if (ch === '{') actionDepth++;
+      if (ch === '}') actionDepth = Math.max(0, actionDepth - 1);
     }
   }
 
diff --git a/tests/test-diagnostic-codes.ts b/tests/test-diagnostic-codes.ts
@@ -111,8 +111,9 @@ console.log('\n=== TEST: Bison diagnostic codes ===');
 }
 
 // out-of-bounds + href
+// Parser view of `expr : A { $3; }`: A(1) + trailing action block(2), so $3 > 2 → OOB (Bison itself would already reject $2).
 {
-  const src = `%token A\n%%\nexpr : A { $2; } ;\n%%\n`;
+  const src = `%token A\n%%\nexpr : A { $3; } ;\n%%\n`;
   const doc = parseBisonDocument(src);
   const diags = computeBisonDiagnostics(doc, src);
   const d = diags.find(x => x.message.includes('out of bounds'));
@@ -321,6 +322,61 @@ const bisonDocCross = parseBisonDocument(bisonSrcCross);
   assert(d?.source === 'flex',                      'missing-grammar-token source is flex',  d?.source);
 }
 
+// ─────────────────────────────────────────────────────────────────────────────
+// 5. Regression tests for reported bugs
+// ─────────────────────────────────────────────────────────────────────────────
+console.log('\n=== TEST: Bug regressions ===');
+
+// Issue #22 — %token with numeric value + string alias: words inside "end of file"
+// must NOT be treated as token names.
+{
+  const src = '%token TOKEN_EOF 0 "end of file"\n%token THREEDIMENSIONAL "3D"\n%token ACTUAL\n%%\nexpr : TOKEN_EOF THREEDIMENSIONAL ACTUAL ;\n%%\n';
+  const doc = parseBisonDocument(src);
+  assert(doc.tokens.has('TOKEN_EOF'),          '#22 TOKEN_EOF is declared');
+  assert(doc.tokens.get('TOKEN_EOF')?.value === 0,               '#22 TOKEN_EOF value = 0');
+  assert(doc.tokens.get('TOKEN_EOF')?.alias === 'end of file',   '#22 TOKEN_EOF alias = "end of file"');
+  assert(!doc.tokens.has('end'),               '#22 "end" is NOT a token (was inside alias)');
+  assert(!doc.tokens.has('of'),                '#22 "of" is NOT a token');
+  assert(!doc.tokens.has('file'),              '#22 "file" is NOT a token');
+  assert(doc.tokens.has('THREEDIMENSIONAL'),   '#22 THREEDIMENSIONAL is declared');
+  assert(doc.tokens.get('THREEDIMENSIONAL')?.alias === '3D', '#22 THREEDIMENSIONAL alias = "3D"');
+  assert(doc.tokens.has('ACTUAL'),             '#22 ACTUAL is declared');
+  const diags22 = computeBisonDiagnostics(doc, src);
+  const unusedTokenDiags = diags22.filter(d => d.code === 'bison/unused-token');
+  assert(unusedTokenDiags.length === 0, '#22 no false bison/unused-token diagnostics');
+}
+
+// Issue #21 — mid-rule action blocks count as grammar symbols.
+// In `testrule: A B { } D { $4 }`, $4 refers to D (symbol #4), not out of bounds.
+{
+  const src = '%token A B D\n%%\ntestrule : A B { } D { $4; } ;\n%%\n';
+  const doc = parseBisonDocument(src);
+  const diags21 = computeBisonDiagnostics(doc, src);
+  const oob = diags21.filter(d => d.code === 'bison/out-of-bounds');
+  assert(oob.length === 0, '#21 $4 in rule with mid-action is not out-of-bounds (A=1 B=2 {action}=3 D=4)');
+}
+{
+  // $6 IS out of bounds: the parser counts A(1) B(2) {action}(3) D(4) plus the trailing {action2}(5) — only 5 symbols
+  const src = '%token A B D\n%%\ntestrule : A B { } D { $6; } ;\n%%\n';
+  const doc = parseBisonDocument(src);
+  const diags21b = computeBisonDiagnostics(doc, src);
+  const oob5 = diags21b.filter(d => d.code === 'bison/out-of-bounds');
+  assert(oob5.length === 1, '#21 $6 IS out-of-bounds (5 symbols: A B {action} D {action2})');
+}
+
+// Issue #23 — rules inside a <SC>{ ... } block inherit the start condition.
+// A catch-all `.` in INITIAL should NOT shadow rules in an exclusive SC block.
+{
+  const src = `%x MY_STATE\n%%\n.\t{}\n<MY_STATE>{\n  [a-z]+ {}\n  [0-9]+ {}\n}\n%%\n`;
+  const doc = require('../server/src/parser/flexParser').parseFlexDocument(src);
+  // Rules inside <MY_STATE>{ } should have startConditions = ['MY_STATE'], not []
+  const rulesInBlock = (doc.rules as { startConditions: string[] }[]).filter(r => r.startConditions.includes('MY_STATE'));
+  assert(rulesInBlock.length === 2, '#23 rules inside <SC>{ block inherit start condition');
+  const diags23 = computeFlexDiagnostics(doc, src);
+  const unreachable = diags23.filter(d => d.code === 'flex/unreachable-rule');
+  assert(unreachable.length === 0, '#23 no false flex/unreachable-rule for exclusive SC block');
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // Results
 // ─────────────────────────────────────────────────────────────────────────────
diff --git a/tests/test-parsers.ts b/tests/test-parsers.ts