fix(search): use AST parsing for index heading extraction

lucifer1004 · lucifer1004 · commit d5260563a477 · 2026-02-07T07:33:57.000+08:00
Regex-based heading detection in index_markdown() incorrectly captured
`#` lines inside code blocks as headings, causing `skc show` to return
truncated section content. Switched to pulldown-cmark AST parsing
(via markdown::extract_headings) which properly ignores code blocks.

Added test coverage for code-block false positives.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Fixed
+
+- Index-based section extraction now correctlyignores headings inside code blocks (WI-2026-02-02-001)
+
 ## [0.2.0] - 2026-01-31
 
 ### Added
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "skillc"
-version = "0.2.0"
+version = "0.2.1"
 edition = "2024"
 license = "MIT OR Apache-2.0"
 description = "A development kit for Agent Skills - the open format for extending AI agent capabilities"
diff --git a/gov/work/2026-02-02-fix-typst-section-truncation.toml b/gov/work/2026-02-02-fix-typst-section-truncation.toml
@@ -0,0 +1,18 @@
+[govctl]
+schema = 1
+id = "WI-2026-02-02-001"
+title = "fix typst section truncation"
+status = "done"
+created = "2026-02-02"
+started = "2026-02-02"
+completed = "2026-02-02"
+
+[content]
+description = """
+Fix index-based section extraction to ignore headings inside code blocks so
+`skc show` returns full section content for typst docs."""
+
+[[content.acceptance_criteria]]
+text = "Index-based section extraction now correctlyignores headings inside code blocks"
+status = "done"
+category = "fixed"
diff --git a/src/search.rs b/src/search.rs
@@ -6,11 +6,11 @@ use crate::config::{ensure_dir, get_cwd};
 use crate::error::{Result, SkillcError};
 use crate::index::{self, SCHEMA_VERSION};
 use crate::logging::{LogEntry, get_run_id, init_log_db, log_access_with_fallback};
+use crate::markdown;
 use crate::resolver::{ResolvedSkill, resolve_skill};
 use crate::{OutputFormat, verbose};
 use chrono::Utc;
 use crossterm::style::Stylize;
-use lazy_regex::{Lazy, Regex, lazy_regex};
 use rusqlite::{Connection, params};
 use serde::Serialize;
 use std::fs;
@@ -19,9 +19,6 @@ use std::path::{Path, PathBuf};
 use std::time::Instant;
 use walkdir::WalkDir;
 
-/// Regex for parsing markdown headings (validated at compile time).
-static HEADING_RE: Lazy<Regex> = lazy_regex!(r"^(#{1,6})\s+(.+)$");
-
 /// Search result entry.
 #[derive(Debug, Serialize)]
 pub struct SearchResult {
@@ -356,22 +353,12 @@ fn index_markdown(conn: &Connection, source_dir: &Path, file_path: &Path) -> Res
 
     let lines: Vec<&str> = content.lines().collect();
 
-    // Find all headings with their positions
-    let mut headings: Vec<(usize, usize, String)> = Vec::new(); // (line_num, level, text)
-    for (i, line) in lines.iter().enumerate() {
-        if let Some(caps) = HEADING_RE.captures(line) {
-            let level = caps
-                .get(1)
-                .ok_or_else(|| SkillcError::Internal("regex group 1 missing".into()))?
-                .as_str()
-                .len();
-            let text = caps
-                .get(2)
-                .ok_or_else(|| SkillcError::Internal("regex group 2 missing".into()))?
-                .as_str()
-                .to_string();
-            headings.push((i, level, text));
-        }
+    // Find all headings with their positions using AST parsing.
+    // This avoids false positives from code blocks.
+    let mut headings: Vec<(usize, usize, String)> = Vec::new(); // (line_idx, level, text)
+    for heading in markdown::extract_headings(&content) {
+        let line_idx = heading.line.saturating_sub(1);
+        headings.push((line_idx, heading.level, heading.text));
     }
 
     if headings.is_empty() {
@@ -716,6 +703,7 @@ fn build_fts_query(query: &str) -> String {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use tempfile::TempDir;
 
     #[test]
     fn test_build_fts_query_simple() {
@@ -744,4 +732,56 @@ mod tests {
         let hash = compute_hash16(&path);
         assert_eq!(hash.len(), 16);
     }
+
+    #[test]
+    fn test_index_markdown_ignores_code_block_headings() {
+        let temp = TempDir::new().unwrap();
+        let source_dir = temp.path();
+        let file_path = source_dir.join("perf.md");
+        let content = r#"# Title
+
+## Typst Performance Profiling
+
+```md
+## Not a heading
+```
+
+More text
+
+## Next Section
+Text
+"#;
+        std::fs::write(&file_path, content).unwrap();
+
+        let conn = Connection::open_in_memory().unwrap();
+        conn.execute(
+            "CREATE VIRTUAL TABLE sections USING fts5(file, section, content, tokenize='unicode61')",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "CREATE TABLE headings (
+                id INTEGER PRIMARY KEY,
+                file TEXT NOT NULL,
+                text TEXT NOT NULL,
+                level INTEGER NOT NULL,
+                start_line INTEGER NOT NULL,
+                end_line INTEGER NOT NULL
+            )",
+            [],
+        )
+        .unwrap();
+        conn.execute(
+            "CREATE INDEX idx_headings_text ON headings(text COLLATE NOCASE)",
+            [],
+        )
+        .unwrap();
+
+        index_markdown(&conn, source_dir, &file_path).unwrap();
+
+        let headings = index::query_headings(&conn, "Typst Performance Profiling", None).unwrap();
+        assert_eq!(headings.len(), 1);
+        assert_eq!(headings[0].start_line, 3);
+        assert_eq!(headings[0].end_line, 11);
+    }
 }