Skip to content

Commit d526056

Browse files
committed
fix(search): use AST parsing for index heading extraction
Regex-based heading detection in index_markdown() incorrectly captured `#` lines inside code blocks as headings, causing `skc show` to return truncated section content. Switched to pulldown-cmark AST parsing (via markdown::extract_headings) which properly ignores code blocks. Added test coverage for code-block false positives.
1 parent 251c670 commit d526056

File tree

5 files changed

+118
-56
lines changed

5 files changed

+118
-56
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Fixed
11+
12+
- Index-based section extraction now correctly ignores headings inside code blocks (WI-2026-02-02-001)
13+
1014
## [0.2.0] - 2026-01-31
1115

1216
### Added

Cargo.lock

Lines changed: 35 additions & 35 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "skillc"
3-
version = "0.2.0"
3+
version = "0.2.1"
44
edition = "2024"
55
license = "MIT OR Apache-2.0"
66
description = "A development kit for Agent Skills - the open format for extending AI agent capabilities"
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[govctl]
2+
schema = 1
3+
id = "WI-2026-02-02-001"
4+
title = "fix typst section truncation"
5+
status = "done"
6+
created = "2026-02-02"
7+
started = "2026-02-02"
8+
completed = "2026-02-02"
9+
10+
[content]
11+
description = """
12+
Fix index-based section extraction to ignore headings inside code blocks so
13+
`skc show` returns full section content for typst docs."""
14+
15+
[[content.acceptance_criteria]]
16+
text = "Index-based section extraction now correctly ignores headings inside code blocks"
17+
status = "done"
18+
category = "fixed"

src/search.rs

Lines changed: 60 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ use crate::config::{ensure_dir, get_cwd};
66
use crate::error::{Result, SkillcError};
77
use crate::index::{self, SCHEMA_VERSION};
88
use crate::logging::{LogEntry, get_run_id, init_log_db, log_access_with_fallback};
9+
use crate::markdown;
910
use crate::resolver::{ResolvedSkill, resolve_skill};
1011
use crate::{OutputFormat, verbose};
1112
use chrono::Utc;
1213
use crossterm::style::Stylize;
13-
use lazy_regex::{Lazy, Regex, lazy_regex};
1414
use rusqlite::{Connection, params};
1515
use serde::Serialize;
1616
use std::fs;
@@ -19,9 +19,6 @@ use std::path::{Path, PathBuf};
1919
use std::time::Instant;
2020
use walkdir::WalkDir;
2121

22-
/// Regex for parsing markdown headings (validated at compile time).
23-
static HEADING_RE: Lazy<Regex> = lazy_regex!(r"^(#{1,6})\s+(.+)$");
24-
2522
/// Search result entry.
2623
#[derive(Debug, Serialize)]
2724
pub struct SearchResult {
@@ -356,22 +353,12 @@ fn index_markdown(conn: &Connection, source_dir: &Path, file_path: &Path) -> Res
356353

357354
let lines: Vec<&str> = content.lines().collect();
358355

359-
// Find all headings with their positions
360-
let mut headings: Vec<(usize, usize, String)> = Vec::new(); // (line_num, level, text)
361-
for (i, line) in lines.iter().enumerate() {
362-
if let Some(caps) = HEADING_RE.captures(line) {
363-
let level = caps
364-
.get(1)
365-
.ok_or_else(|| SkillcError::Internal("regex group 1 missing".into()))?
366-
.as_str()
367-
.len();
368-
let text = caps
369-
.get(2)
370-
.ok_or_else(|| SkillcError::Internal("regex group 2 missing".into()))?
371-
.as_str()
372-
.to_string();
373-
headings.push((i, level, text));
374-
}
356+
// Find all headings with their positions using AST parsing.
357+
// This avoids false positives from code blocks.
358+
let mut headings: Vec<(usize, usize, String)> = Vec::new(); // (line_idx, level, text)
359+
for heading in markdown::extract_headings(&content) {
360+
let line_idx = heading.line.saturating_sub(1);
361+
headings.push((line_idx, heading.level, heading.text));
375362
}
376363

377364
if headings.is_empty() {
@@ -716,6 +703,7 @@ fn build_fts_query(query: &str) -> String {
716703
#[cfg(test)]
717704
mod tests {
718705
use super::*;
706+
use tempfile::TempDir;
719707

720708
#[test]
721709
fn test_build_fts_query_simple() {
@@ -744,4 +732,56 @@ mod tests {
744732
let hash = compute_hash16(&path);
745733
assert_eq!(hash.len(), 16);
746734
}
735+
736+
#[test]
737+
fn test_index_markdown_ignores_code_block_headings() {
738+
let temp = TempDir::new().unwrap();
739+
let source_dir = temp.path();
740+
let file_path = source_dir.join("perf.md");
741+
let content = r#"# Title
742+
743+
## Typst Performance Profiling
744+
745+
```md
746+
## Not a heading
747+
```
748+
749+
More text
750+
751+
## Next Section
752+
Text
753+
"#;
754+
std::fs::write(&file_path, content).unwrap();
755+
756+
let conn = Connection::open_in_memory().unwrap();
757+
conn.execute(
758+
"CREATE VIRTUAL TABLE sections USING fts5(file, section, content, tokenize='unicode61')",
759+
[],
760+
)
761+
.unwrap();
762+
conn.execute(
763+
"CREATE TABLE headings (
764+
id INTEGER PRIMARY KEY,
765+
file TEXT NOT NULL,
766+
text TEXT NOT NULL,
767+
level INTEGER NOT NULL,
768+
start_line INTEGER NOT NULL,
769+
end_line INTEGER NOT NULL
770+
)",
771+
[],
772+
)
773+
.unwrap();
774+
conn.execute(
775+
"CREATE INDEX idx_headings_text ON headings(text COLLATE NOCASE)",
776+
[],
777+
)
778+
.unwrap();
779+
780+
index_markdown(&conn, source_dir, &file_path).unwrap();
781+
782+
let headings = index::query_headings(&conn, "Typst Performance Profiling", None).unwrap();
783+
assert_eq!(headings.len(), 1);
784+
assert_eq!(headings[0].start_line, 3);
785+
assert_eq!(headings[0].end_line, 11);
786+
}
747787
}

0 commit comments

Comments
 (0)