Add parseAnchors function (#5)

frankharkins · web-flow · commit b898dae03713 · 2026-01-12T15:41:04.000Z
* Add `parseAnchors` function

* clippy
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,6 +9,7 @@ version = "0.1.0"
 crate-type = ["cdylib"]
 
 [dependencies]
+fancy-regex = "0.17.0"
 markdown    = "1.0.0"
 napi        = { version = "3.0.0", features = ["async"] }
 napi-derive = "3.0.0"
diff --git a/__test__/index.spec.ts b/__test__/index.spec.ts
@@ -1,6 +1,6 @@
 import test from 'ava'
 
-import { extractLinks, extractLinksFromFile } from '../index'
+import { extractLinks, extractAnchors, extractLinksFromFile } from '../index'
 
 const dedent = (s: string) => s.replace('\n    ', '')
 
@@ -154,3 +154,34 @@ test('extractLinksFromFile: invalid notebook (bad schema)', async (t) => {
     'Could not read "__test__/fixtures/invalid-notebook-schema.ipynb": missing field `source` at line 10 column 5',
   )
 })
+
+test('extractAnchors: no anchors', (t) => {
+  t.deepEqual(extractAnchors(''), [])
+})
+
+test('extractAnchors: simple heading', (t) => {
+  t.deepEqual(extractAnchors('# My heading'), ['#my-heading'])
+})
+
+test('extractAnchors: duplicate headings', (t) => {
+  t.deepEqual(
+    extractAnchors('# My heading\n\n## My heading\n\n### My heading').sort(),
+    ['#my-heading', '#my-heading-1', '#my-heading-2'].sort(),
+  )
+})
+
+test('extractAnchors: markdown in headings', (t) => {
+  t.deepEqual(extractAnchors('# My **heading**'), ['#my-**heading**'])
+})
+
+test('extractAnchors: forbidden characters', (t) => {
+  t.deepEqual(extractAnchors('## A heading with crazy punctuation.,;:!?`\()"\\'), ['#a-heading-with-crazy-punctuation'])
+})
+
+test('extractAnchors: id tags', (t) => {
+  t.deepEqual(extractAnchors('<id="thing">'), ['#thing'])
+})
+
+test('extractAnchors: duplicate id tags', (t) => {
+  t.deepEqual(extractAnchors('<id="thing">\n\n<id="thing">'), ['#thing'])
+})
diff --git a/index.d.ts b/index.d.ts
@@ -1,5 +1,12 @@
 /* auto-generated by NAPI-RS */
 /* eslint-disable */
+/**
+ * Extract anchors from a markdown string. Anchors are either:
+ *  * slugified headings, deduplicated if the same heading appears more than once
+ *  * `id` props of HTML tags. These are not deduplicated as they should be unique per file
+ */
+export declare function extractAnchors(markdown: string): Array<string>
+
 /**
  * Extract links from a markdown string. Supports GitHub-flavored markdown
  * (gfm), math, and JSX.
diff --git a/index.js b/index.js
@@ -572,5 +572,6 @@ if (!nativeBinding) {
 }
 
 module.exports = nativeBinding
+module.exports.extractAnchors = nativeBinding.extractAnchors
 module.exports.extractLinks = nativeBinding.extractLinks
 module.exports.extractLinksFromFile = nativeBinding.extractLinksFromFile
diff --git a/src/anchors/mod.rs b/src/anchors/mod.rs
@@ -0,0 +1,62 @@
+use fancy_regex::Regex;
+use std::collections::HashMap;
+
+/// Collect every anchor in `markdown` as a `#`-prefixed string; output order is unspecified.
+/// A heading slug seen k times yields `#slug`, `#slug-1`, …, `#slug-(k-1)`; each `id="…"`
+/// value is added once if unseen. Panics only if a hard-coded regex fails to compile.
+pub fn extract_anchors_from_ref(markdown: &str) -> Vec<String> {
+  let heading_regex = Regex::new("^\\s*#{1,6}\\s+(.+?)\\s*$").unwrap();
+  let id_regex = Regex::new("(?<=id=\")(.+?)(?=\")").unwrap();
+
+  let mut anchor_occurrences = HashMap::<String, u32>::default();
+  for line in markdown.split("\n") {
+    if let Some(heading) = get_first_capture(line, &heading_regex) {
+      *anchor_occurrences.entry(heading_to_anchor(heading)).or_insert(0) += 1;
+    }
+    // One line can hold several tags: record every id (stop at the first engine error).
+    for id in id_regex.find_iter(line).map_while(Result::ok) {
+      anchor_occurrences.entry(id.as_str().to_string()).or_insert(1);
+    }
+  }
+
+  anchor_occurrences
+    .into_iter()
+    .flat_map(|(anchor, duplications)| {
+      (0..duplications).map(move |n| match n {
+        0 => format!("#{anchor}"),
+        n => format!("#{anchor}-{n}"),
+      })
+    })
+    .collect()
+}
+
+/// Slugify a heading's text into an anchor (without the leading `#`).
+///
+/// The text is trimmed and lowercased, each space becomes `-`, and the
+/// punctuation listed below is dropped. Every other character — including
+/// markdown markers such as `*` — is kept as-is.
+fn heading_to_anchor(heading: &str) -> String {
+  let is_dropped = |c: char| {
+    matches!(
+      c,
+      '.' | ',' | ';' | ':' | '!' | '?' | '`' | '\\' | '(' | ')' | '"' | '\''
+    )
+  };
+
+  let mut slug = String::new();
+  for c in heading.trim().to_lowercase().chars() {
+    if c == ' ' {
+      slug.push('-');
+    } else if !is_dropped(c) {
+      slug.push(c);
+    }
+  }
+  slug
+}
+
+/// Text of capture group 1 from the first match of `r` in `s`; `None` when there is no
+/// match, group 1 did not participate, or the regex engine returned an error.
+fn get_first_capture<'a>(s: &'a str, r: &Regex) -> Option<&'a str> {
+  let first_match = r.captures(s).ok().flatten()?;
+  first_match.get(1).map(|group| group.as_str())
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(clippy::all)]
-
 use markdown::mdast::{AttributeContent, AttributeValue, MdxJsxTextElement};
 use markdown::{mdast::Node, to_mdast, Constructs, ParseOptions};
 use napi::Error;
@@ -36,6 +34,18 @@ pub async fn extract_links_from_file(file_path: String) -> Result<Vec<String>, E
   extract_links(markdown)
 }
 
+use crate::anchors::extract_anchors_from_ref;
+
+mod anchors;
+
+/// Extract anchors from a markdown string, each returned with a leading `#`, in no set order:
+///  * slugified headings; a repeated heading gets `-1`, `-2`, … suffixes to stay distinct
+///  * `id="…"` attribute values; repeating an id lists it once and adds no suffixed entry
+#[napi]
+pub fn extract_anchors(markdown: String) -> Vec<String> {
+  extract_anchors_from_ref(&markdown)
+}
+
 /// Extract links from a markdown string. Supports GitHub-flavored markdown
 /// (gfm), math, and JSX.
 #[napi]

Original file line number	Diff line number	Diff line change
`@@ -572,5 +572,6 @@ if (!nativeBinding) {`
`572`	`572`	`}`
`573`	`573`
`574`	`574`	`module.exports = nativeBinding`
	`575`	`+module.exports.extractAnchors = nativeBinding.extractAnchors`
`575`	`576`	`module.exports.extractLinks = nativeBinding.extractLinks`
`576`	`577`	`module.exports.extractLinksFromFile = nativeBinding.extractLinksFromFile`