fix: Improve AsciiDoc→Markdown converter in generate-llms-txt

raifdmueller · claude · raifdmueller · commit 11dcdb58583c · 2026-02-20T21:51:28.000+01:00
- Fix table conversion: use block-level parser (convertAdocTable) that
  correctly handles cells on separate lines → proper Markdown table
- Fix line continuation: remove AsciiDoc '+' continuation marker
- Fix nested definition lists: term::: now converted to '- **term**:'
- Fix relative links: ../file.adoc → absolute GitHub URLs
- Fix def-list regex: use [^\S\n]* instead of \s* to prevent matching
  across newlines (was causing term desc to bleed into next paragraph)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/scripts/generate-llms-txt.js b/scripts/generate-llms-txt.js
@@ -17,6 +17,51 @@ const categories = JSON.parse(
   fs.readFileSync(path.join(ROOT, 'website/public/data/categories.json'), 'utf-8')
 )
 
+// ─── AsciiDoc table converter ────────────────────────────────────────────────
+// Converts an AsciiDoc table body (the text inside a |=== block) to a Markdown table string; returns '' if it has no cells.
+function convertAdocTable(body) {
+  const lines = body.split('\n')
+  const allCells = [] // every cell in document order, not yet grouped into rows
+
+  for (const line of lines) {
+    const trimmed = line.trim()
+    if (!trimmed || !trimmed.startsWith('|')) continue // blank lines and lines not starting with '|' are dropped, so wrapped cell text on a following line is lost
+    // Split before each '|' (the lookahead keeps the pipe on its cell): |cell1 |cell2 → ['cell1', 'cell2']
+    const parts = trimmed.split(/(?=\|)/).filter(Boolean)
+    for (const part of parts) {
+      if (part.startsWith('|')) allCells.push(part.slice(1).trim())
+    }
+  }
+
+  if (allCells.length === 0) return ''
+
+  // Column count = number of '|'-separated parts on the first line that starts with '|'
+  let colCount = 0
+  for (const line of lines) {
+    const trimmed = line.trim()
+    if (!trimmed || !trimmed.startsWith('|')) continue
+    colCount = trimmed.split(/(?=\|)/).filter(Boolean).length
+    if (colCount > 0) break
+  }
+  if (colCount <= 0) colCount = 2 // fallback; NOTE(review): appears unreachable, since a line starting with '|' always yields at least one part
+
+  // Group the flat cell list into rows of colCount cells each
+  const rows = []
+  for (let i = 0; i < allCells.length; i += colCount) {
+    rows.push(allCells.slice(i, i + colCount)) // the final row is shorter when the cell total is not a multiple of colCount
+  }
+
+  if (rows.length === 0) return ''
+
+  const out = []
+  out.push('| ' + rows[0].join(' | ') + ' |') // the first row is emitted as the Markdown header row
+  out.push('| ' + rows[0].map(() => '---').join(' | ') + ' |') // separator row: one '---' per header cell
+  for (const row of rows.slice(1)) {
+    if (row.length > 0) out.push('| ' + row.join(' | ') + ' |')
+  }
+  return out.join('\n') // no trailing newline is appended
+}
+
 // ─── AsciiDoc → Markdown converter ──────────────────────────────────────────
 
 function adocToMarkdown(adoc) {
@@ -44,25 +89,35 @@ function adocToMarkdown(adoc) {
   md = md.replace(/^\[%collapsible\]\s*$/gm, '')
   md = md.replace(/^====\s*$/gm, '')
 
-  // Tables |=== → remove delimiters
-  md = md.replace(/^\|===\s*$/gm, '')
-
-  // Table rows: |cell content → keep, clean up leading pipe
-  md = md.replace(/^\|(.+)$/gm, (_, row) => {
-    const cells = row.split('|').map((c) => c.trim()).filter(Boolean)
-    return '| ' + cells.join(' | ') + ' |'
-  })
+  // Tables: convert full blocks including optional attribute line (handles multi-line cells)
+  md = md.replace(/(?:\[[^\]]*\]\s*\n)?\|===\s*\n([\s\S]*?)\|===\s*/g, (_, body) =>
+    convertAdocTable(body)
+  )
 
-  // Remove block attribute lines
+  // Remove remaining block attribute lines
   md = md.replace(/^\[(?:horizontal|sidebar|cols[^\]]*|options[^\]]*|%\w+[^\]]*)\]\s*$/gm, '')
 
+  // AsciiDoc line continuation (+) → remove
+  md = md.replace(/^\+\s*$/gm, '')
+
+  // Nested definition lists: term::: description → - **term**: description
+  md = md.replace(/^([^:\n|#`>]+):::[^\S\n]*(.*)$/gm, (_, term, desc) =>
+    desc.trim() ? `- **${term.trim()}**: ${desc.trim()}` : `- **${term.trim()}**`
+  )
+
   // Definition lists: term:: description → **term**: description
-  md = md.replace(/^([^:\n|#`>]+)::\s*(.*)$/gm, (_, term, desc) =>
+  // Use [^\S\n]* (horizontal whitespace only) to avoid matching across newlines
+  md = md.replace(/^([^:\n|#`>]+)::[^\S\n]*(.*)$/gm, (_, term, desc) =>
     desc.trim() ? `**${term.trim()}**: ${desc.trim()}` : `**${term.trim()}**`
   )
 
-  // Links: link:url[text] → [text](url)
-  md = md.replace(/link:([^\[]+)\[([^\]]*)\]/g, '[$2]($1)')
+  // Links: link:url[text] → [text](url), resolve relative .adoc paths to GitHub URLs
+  md = md.replace(/link:([^\[]+)\[([^\]]*)\]/g, (_, url, text) => {
+    if (/^\.\.\/.*\.adoc$/.test(url)) {
+      url = 'https://github.com/LLM-Coding/Semantic-Anchors/blob/main/' + url.slice(3)
+    }
+    return `[${text}](${url})`
+  })
 
   // Cross-references: <<id,text>> → text, <<id>> → `id`
   md = md.replace(/<<([^,>]+),([^>]+)>>/g, '$2')
diff --git a/website/public/llms.txt b/website/public/llms.txt
@@ -86,26 +86,13 @@ Not every well-known term qualifies as a semantic anchor. Understanding what **d
 These terms are frequently used but **lack the depth and definition** required for semantic anchors:
 
 | ❌ Not a Semantic Anchor | Why Not? |
-
-| **"TLDR"** |
-| Underspecified instruction with no defined structure or methodology. Just means "be brief" without any framework for how. |
-
-| **"ELI5"** (Explain Like I'm 5) |
-| Vague target level with no pedagogical framework. What does "5-year-old level" mean technically? No consistent interpretation. |
-
-| **"Keep it short"** |
-| Pure directive with no conceptual depth. It's an instruction, not a methodology. |
-
-| **"Best practices"** |
-| Too vague and not attributable. Whose best practices? Based on what research or authority? |
-
-| **"Modern approach"** |
-| Ambiguous and inconsistent. "Modern" means different things to different people and changes over time. |
-
-| **"Make it simple"** |
-| No reference to specific simplification frameworks (unlike KISS principle or Occam's Razor which **are** semantic anchors). |
-
-### Comparison: Good vs. Bad
+| --- | --- |
+| **"TLDR"** | Underspecified instruction with no defined structure or methodology. Just means "be brief" without any framework for how. |
+| **"ELI5"** (Explain Like I'm 5) | Vague target level with no pedagogical framework. What does "5-year-old level" mean technically? No consistent interpretation. |
+| **"Keep it short"** | Pure directive with no conceptual depth. It's an instruction, not a methodology. |
+| **"Best practices"** | Too vague and not attributable. Whose best practices? Based on what research or authority? |
+| **"Modern approach"** | Ambiguous and inconsistent. "Modern" means different things to different people and changes over time. |
+| **"Make it simple"** | No reference to specific simplification frameworks (unlike KISS principle or Occam's Razor which **are** semantic anchors). |### Comparison: Good vs. Bad
 
 **Example 1: Testing Instructions**
 
@@ -275,7 +262,7 @@ If the LLM provides accurate, detailed information, it's safe to use that anchor
 
 ## Contributing
 
-We welcome proposals for new semantic anchors! See our [Contributing Guide](../CONTRIBUTING.adoc) for details on:
+We welcome proposals for new semantic anchors! See our [Contributing Guide](https://github.com/LLM-Coding/Semantic-Anchors/blob/main/CONTRIBUTING.adoc) for details on:
 
 * How to propose a new anchor
 * Quality criteria and testing methodology
@@ -740,7 +727,8 @@ to specific team members.
 * Communicating the nature of changes to teammates, the public, and other stakeholders
 **Schema**: <type>[!][(optional scope)]: <description> + optional body/footer
 
-**Common Types**: * **feat:** - introduce new feature to the codebase (-> Semver Minor)
+**Common Types**
+* **feat:** - introduce new feature to the codebase (-> Semver Minor)
 * **fix:** - patches a bug in your codebase (-> SemVer Patch)
 * **docs:** - documentation improvements to the codebase
 * **chore:** - codebase/repository housekeeping changes
@@ -807,11 +795,12 @@ to specific team members.
 
 **Core Concepts**:
 
-**Version format**: +
-**MAJOR**: : Incompatible API changes (breaking changes)
-**MINOR**: : Backward-compatible functionality additions
-**PATCH**: : Backward-compatible bug fixes
-**Pre-release versions**: : Append hyphen and identifiers (e.g., 1.0.0-alpha.1)
+**Version format**
+
+- **MAJOR**: Incompatible API changes (breaking changes)
+- **MINOR**: Backward-compatible functionality additions
+- **PATCH**: Backward-compatible bug fixes
+- **Pre-release versions**: Append hyphen and identifiers (e.g., 1.0.0-alpha.1)
 
 **Build metadata**: Append plus sign and identifiers (e.g., 1.0.0+20241111)
 
@@ -1047,13 +1036,15 @@ to specific team members.
 
 **Core Concepts**:
 
-**Four documentation types**: +
-**Tutorials**: : Learning-oriented, lessons for beginners
-**How-to guides**: : Task-oriented, directions for specific goals
-**Reference**: : Information-oriented, technical descriptions
-**Explanation**: : Understanding-oriented, conceptual discussions
+**Four documentation types**
+
+- **Tutorials**: Learning-oriented, lessons for beginners
+- **How-to guides**: Task-oriented, directions for specific goals
+- **Reference**: Information-oriented, technical descriptions
+- **Explanation**: Understanding-oriented, conceptual discussions
 
-**Two dimensions**: * Practical vs. Theoretical
+**Two dimensions**
+* Practical vs. Theoretical
 * Acquisition (learning) vs. Application (working)
 
 **Separation of concerns**: Each type serves a distinct purpose
@@ -1621,7 +1612,8 @@ Filter by constraints (e.g., "AsciiDoc + Hugo not well-supported")
 
 **Lightweight documentation**: Short, focused records
 
-**Standard structure**: * Title
+**Standard structure**
+* Title
 * Status (proposed, accepted, deprecated, superseded)
 * Context (forces at play)
 * Decision (what was chosen)
@@ -1694,11 +1686,12 @@ Filter by constraints (e.g., "AsciiDoc + Hugo not well-supported")
 
 **Core Concepts**:
 
-**Four levels of abstraction**: +
-**Level 1 - Context**: : System in its environment (users, external systems)
-**Level 2 - Container**: : Applications and data stores that make up the system
-**Level 3 - Component**: : Components within containers
-**Level 4 - Code**: : Class diagrams, entity relationships (optional)
+**Four levels of abstraction**
+
+- **Level 1 - Context**: System in its environment (users, external systems)
+- **Level 2 - Container**: Applications and data stores that make up the system
+- **Level 3 - Component**: Components within containers
+- **Level 4 - Code**: Class diagrams, entity relationships (optional)
 
 **Zoom in/out**: Progressive disclosure of detail
 
@@ -1826,7 +1819,8 @@ Filter by constraints (e.g., "AsciiDoc + Hugo not well-supported")
 
 **Structured template**: Well-defined format with specific sections
 
-**Standard fields**: * Title (short noun phrase)
+**Standard fields**
+* Title (short noun phrase)
 * Status (proposed, accepted, rejected, deprecated, superseded)
 * Context and Problem Statement
 * Decision Drivers (forces influencing the decision)
@@ -1882,7 +1876,8 @@ Filter by constraints (e.g., "AsciiDoc + Hugo not well-supported")
 
 **Special Cause Variation**: Assignable, correctable deviation
 
-**Chart Types**: * X-bar Chart: Subgroup means
+**Chart Types**
+* X-bar Chart: Subgroup means
 * R-Chart: Subgroup ranges
 * I-MR Chart: Individual values and moving range
 * p-Chart: Defect proportions
@@ -2014,12 +2009,13 @@ SPC:: Nelson Rules are a tool within Statistical Process Control
 
 **Core Concepts**:
 
-**Five domains**: +
-**Clear (formerly "Simple")**: : Best practices apply, sense-categorize-respond
-**Complicated**: : Good practices exist, sense-analyze-respond
-**Complex**: : Emergent practices, probe-sense-respond
-**Chaotic**: : Novel practices needed, act-sense-respond
-**Confused (center)**: : Don't know which domain you're in
+**Five domains**
+
+- **Clear (formerly "Simple")**: Best practices apply, sense-categorize-respond
+- **Complicated**: Good practices exist, sense-analyze-respond
+- **Complex**: Emergent practices, probe-sense-respond
+- **Chaotic**: Novel practices needed, act-sense-respond
+- **Confused (center)**: Don't know which domain you're in
 
 **Domain transitions**: How situations move between domains
 
@@ -2182,11 +2178,12 @@ SPC:: Nelson Rules are a tool within Statistical Process Control
 
 **Core Concepts**:
 
-**Four Safety Integrity Levels**: +
-**SIL 1 (lowest)**: : 10^-2^ ≤ PFD < 10^-1^ (tolerable risk reduction)
-**SIL 2**: : 10^-3^ ≤ PFD < 10^-2^ (moderate risk reduction)
-**SIL 3**: : 10^-4^ ≤ PFD < 10^-3^ (substantial risk reduction)
-**SIL 4 (highest)**: : 10^-5^ ≤ PFD < 10^-4^ (maximum risk reduction)
+**Four Safety Integrity Levels**
+
+- **SIL 1 (lowest)**: 10^-2^ ≤ PFD < 10^-1^ (tolerable risk reduction)
+- **SIL 2**: 10^-3^ ≤ PFD < 10^-2^ (moderate risk reduction)
+- **SIL 3**: 10^-4^ ≤ PFD < 10^-3^ (substantial risk reduction)
+- **SIL 4 (highest)**: 10^-5^ ≤ PFD < 10^-4^ (maximum risk reduction)
 
 **Risk-based classification**: SIL level determined by hazard analysis and risk assessment
 
@@ -2387,7 +2384,8 @@ SPC:: Nelson Rules are a tool within Statistical Process Control
 
 **Core Concepts**:
 
-**Three layers**: * **Unit tests** (base): Many fast, isolated tests
+**Three layers**
+* **Unit tests** (base): Many fast, isolated tests
 * **Integration tests** (middle): Moderate number, test component interaction
 * **End-to-end tests** (top): Few, test complete user journeys