diff --git a/Cargo.toml b/Cargo.toml
index 12cea95..748dee2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,6 @@ version = "0.1.0"
crate-type = ["cdylib"]
[dependencies]
-fancy-regex = "0.17.0"
markdown = "1.0.0"
napi = { version = "3.0.0", features = ["async"] }
napi-derive = "3.0.0"
diff --git a/README.md b/README.md
index 95635db..9fb0e9d 100644
--- a/README.md
+++ b/README.md
@@ -49,5 +49,4 @@ Follow these steps for a foolproof way to make a new release:
3. Run `gh pr create` and accept the defaults. This will create a PR with the correct title. Submit the PR.
4. Once CI has passed and been approved, merge the PR. This will trigger a release.
-
> WARN: Don't run `npm publish` manually.
diff --git a/__test__/index.spec.ts b/__test__/index.spec.ts
index 2a549cb..a26ffbe 100644
--- a/__test__/index.spec.ts
+++ b/__test__/index.spec.ts
@@ -167,8 +167,26 @@ test('extractAnchors: duplicate headings', (t) => {
)
})
-test('extractAnchors: markdown in headings', (t) => {
- t.deepEqual(extractAnchors('# My **heading**'), ['#my-**heading**'])
+test('extractAnchors: bold text in headings', (t) => {
+ t.deepEqual(extractAnchors('# My **heading**'), ['#my-heading'])
+})
+
+test('extractAnchors: code in headings', (t) => {
+ t.deepEqual(extractAnchors('# My `heading`'), ['#my-heading'])
+})
+
+test('extractAnchors: math in headings', (t) => {
+ t.deepEqual(extractAnchors('## Gates $\\rightarrow$ quantum gates'), ['#gates-rightarrow-quantum-gates'])
+ t.deepEqual(
+ extractAnchors(
+ '### Template circuits for calculating matrix elements of $\\tilde{S}$ and $\\tilde{H}$ via Hadamard test',
+ ),
+ ['#template-circuits-for-calculating-matrix-elements-of-tildes-and-tildeh-via-hadamard-test'],
+ )
+})
+
+test('extractAnchors: mdx in headings', (t) => {
+ t.deepEqual(extractAnchors('# My heading`'), ['#my-heading'])
})
test('extractAnchors: forbidden characters', (t) => {
@@ -176,13 +194,15 @@ test('extractAnchors: forbidden characters', (t) => {
})
test('extractAnchors: id tags', (t) => {
- t.deepEqual(extractAnchors('<div id="thing">'), ['#thing'])
+ t.deepEqual(extractAnchors('<div id="thing" />'), ['#thing'])
})
test('extractAnchors: duplicate id tags', (t) => {
- t.deepEqual(extractAnchors('<div id="thing">\n\n<div id="thing">'), ['#thing'])
+ t.deepEqual(extractAnchors('<div id="thing" />\n\n<div id="thing" />'), ['#thing'])
})
test('extractAnchors: headings with links', (t) => {
- t.deepEqual(extractAnchors('# My [heading](/test1) with [multiple links](/test2)'), ['#my-heading-with-multiple-links'])
+ t.deepEqual(extractAnchors('# My [heading](/test1) with [multiple links](/test2)'), [
+ '#my-heading-with-multiple-links',
+ ])
})
diff --git a/src/anchors.rs b/src/anchors.rs
new file mode 100644
index 0000000..06033e1
--- /dev/null
+++ b/src/anchors.rs
@@ -0,0 +1,105 @@
+use markdown::mdast::{AttributeContent, AttributeValue, Heading, Node};
+use std::collections::HashMap;
+
+/// If `node` is a heading or mdx element with `id` prop, extract the heading
+/// text and add it to `anchor_occurrences`
+pub fn extract_from_node(node: &Node, anchor_occurrences: &mut HashMap<String, i32>) {
+ match node {
+ Node::Heading(h) => {
+ let anchor = anchor_from_heading(h);
+ let existing_duplicates = anchor_occurrences.get(&anchor).unwrap_or(&0);
+ anchor_occurrences.insert(anchor, *existing_duplicates + 1);
+ }
+ Node::MdxJsxFlowElement(el) => {
+ if let Some(anchor) = get_id_prop(&el.attributes) {
+ anchor_occurrences.insert(anchor, 1);
+ };
+ }
+ Node::MdxJsxTextElement(el) => {
+ if let Some(anchor) = get_id_prop(&el.attributes) {
+ anchor_occurrences.insert(anchor, 1);
+ };
+ }
+ _ => (),
+ };
+}
+
+pub fn deduplicate_anchors(anchor_occurrences: HashMap<String, i32>) -> Vec<String> {
+ anchor_occurrences
+ .into_iter()
+ .flat_map(|(anchor, duplications)| {
+ (0..duplications).map(move |n| match n {
+ 0 => format!("#{anchor}"),
+ n => format!("#{anchor}-{n}"),
+ })
+ })
+ .collect()
+}
+
+fn get_id_prop(attributes: &Vec<AttributeContent>) -> Option<String> {
+ for attr in attributes.iter() {
+ let AttributeContent::Property(prop) = attr else {
+ continue;
+ };
+ if prop.name != "id" {
+ continue;
+ };
+ if let Some(AttributeValue::Literal(text)) = prop.value.clone() {
+ return Some(text);
+ }
+ }
+ return None;
+}
+
+fn anchor_from_heading(heading: &Heading) -> String {
+ let mut text = String::with_capacity(100);
+ for child in heading.children.iter() {
+ get_text(child, &mut text);
+ }
+ heading_to_anchor(text)
+}
+
+/// Get plain text from a node and all its children
+pub fn get_text<'a>(node: &'a Node, s: &mut String) {
+ let maybe_text = match node {
+ Node::Text(text) => Some(&text.value),
+ Node::InlineCode(text) => Some(&text.value),
+ Node::InlineMath(text) => Some(&text.value),
+ _ => None,
+ };
+ if let Some(text) = maybe_text {
+ s.push_str(text.as_str())
+ };
+
+ if let Some(children) = node.children() {
+ for child in children {
+ get_text(child, s);
+ }
+ }
+}
+
+fn heading_to_anchor(heading: String) -> String {
+ heading
+ .trim()
+ .to_lowercase()
+ .chars()
+ .filter_map(|c| match c {
+ ' ' => Some('-'),
+ '.' => None,
+ ',' => None,
+ ';' => None,
+ ':' => None,
+ '!' => None,
+ '?' => None,
+ '`' => None,
+ '\\' => None,
+ '(' => None,
+ ')' => None,
+ '"' => None,
+ '\'' => None,
+ '{' => None,
+ '}' => None,
+ x => Some(x),
+ })
+ .collect()
+}
diff --git a/src/anchors/mod.rs b/src/anchors/mod.rs
deleted file mode 100644
index 483f575..0000000
--- a/src/anchors/mod.rs
+++ /dev/null
@@ -1,72 +0,0 @@
-use fancy_regex::Regex;
-use std::collections::HashMap;
-
-pub fn extract_anchors_from_ref(markdown: &str) -> Vec<String> {
- let heading_regex = Regex::new("^\\s*#{1,6}\\s+(.+?)\\s*$").unwrap();
- let id_regex = Regex::new("(?<=id=\")(.+?)(?=\")").unwrap();
-
- let mut anchor_occurrences = HashMap::<String, i32>::default();
- for line in markdown.split("\n") {
- if let Some(heading) = get_first_capture(line, &heading_regex) {
- let anchor = heading_to_anchor(heading);
- let existing_duplicates = anchor_occurrences.get(&anchor).unwrap_or(&0);
- anchor_occurrences.insert(anchor, *existing_duplicates + 1);
- }
- if let Some(id) = get_first_capture(line, &id_regex) {
- if !anchor_occurrences.contains_key(id) {
- anchor_occurrences.insert(id.to_string(), 1);
- }
- }
- }
-
- anchor_occurrences
- .into_iter()
- .flat_map(|(anchor, duplications)| {
- (0..duplications).map(move |n| match n {
- 0 => format!("#{anchor}"),
- n => format!("#{anchor}-{n}"),
- })
- })
- .collect()
-}
-
-fn heading_to_anchor(heading: &str) -> String {
- let heading_without_links = remove_markdown_links(heading);
- heading_without_links
- .trim()
- .to_lowercase()
- .chars()
- .filter_map(|c| match c {
- ' ' => Some('-'),
- '.' => None,
- ',' => None,
- ';' => None,
- ':' => None,
- '!' => None,
- '?' => None,
- '`' => None,
- '\\' => None,
- '(' => None,
- ')' => None,
- '"' => None,
- '\'' => None,
- x => Some(x),
- })
- .collect()
-}
-
-fn get_first_capture<'a>(s: &'a str, r: &Regex) -> Option<&'a str> {
- let Ok(Some(captures)) = r.captures(s) else {
- return None;
- };
- Some(captures.get(1)?.as_str())
-}
-
-/// Extracts the text inside every markdown link found in `markdown`.
-///
-/// Example:
-/// "My [heading with links](/test)" -> "My heading with links"
-fn remove_markdown_links(markdown: &str) -> String {
- let re = Regex::new(r"\[([^\[\]]+)\]\(([^)]+)\)").unwrap();
- re.replace_all(markdown, "$1").to_string()
-}
diff --git a/src/lib.rs b/src/lib.rs
index b1a06e8..ceef312 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,12 +1,14 @@
+use std::collections::{HashMap, HashSet};
+
use napi::Error;
use napi_derive::napi;
use tokio::fs;
-use crate::anchors::extract_anchors_from_ref;
use crate::notebook::extract_markdown_from_notebook_source;
mod anchors;
mod links;
+mod mdx;
mod notebook;
fn file_read_error(path: String, reason: String) -> Error {
@@ -37,24 +39,42 @@ pub async fn extract_from_file(file_path: String) -> Result<Vec<Vec<String>>, Er
source
};
- let anchors = extract_anchors_from_ref(&markdown);
- match extract_links(markdown) {
- Ok(links) => Ok(vec![links, anchors]),
- Err(e) => Err(Error::from_reason(e.to_string())),
- }
+ let ast_root = mdx::parse_mdx(markdown)?;
+
+ let mut link_set = HashSet::<&String>::default();
+ let mut anchor_occurrences = HashMap::<String, i32>::default();
+ mdx::walk_ast(&ast_root, &mut |node| {
+ links::extract_from_node(node, &mut link_set);
+ anchors::extract_from_node(node, &mut anchor_occurrences);
+ });
+
+ Ok(vec![
+ link_set.into_iter().cloned().collect(),
+ anchors::deduplicate_anchors(anchor_occurrences),
+ ])
}
/// Extract anchors from a markdown string. Anchors are either:
/// * slugified headings, deduplicated if the same heading appears more than once
/// * `id` props of HTML tags. These are not deduplicated as they should be unique per file
#[napi]
-pub fn extract_anchors(markdown: String) -> Vec<String> {
- extract_anchors_from_ref(&markdown)
+pub fn extract_anchors(markdown: String) -> Result<Vec<String>, Error> {
+ let ast_root = mdx::parse_mdx(markdown)?;
+ let mut anchor_occurrences = HashMap::<String, i32>::default();
+ mdx::walk_ast(&ast_root, &mut |node| {
+ anchors::extract_from_node(node, &mut anchor_occurrences)
+ });
+ Ok(anchors::deduplicate_anchors(anchor_occurrences))
}
/// Extract links from a markdown string. Supports GitHub-flavored markdown
/// (gfm), math, and JSX.
#[napi]
pub fn extract_links(markdown: String) -> Result<Vec<String>, Error> {
- links::extract_links(markdown)
+ let ast_root = mdx::parse_mdx(markdown)?;
+ let mut links = HashSet::<&String>::default();
+ mdx::walk_ast(&ast_root, &mut |node| {
+ links::extract_from_node(node, &mut links)
+ });
+ Ok(links.into_iter().cloned().collect())
}
diff --git a/src/links.rs b/src/links.rs
new file mode 100644
index 0000000..dceaf3d
--- /dev/null
+++ b/src/links.rs
@@ -0,0 +1,29 @@
+use markdown::mdast::Node;
+use markdown::mdast::{AttributeContent, AttributeValue, MdxJsxTextElement};
+use std::collections::HashSet;
+
+pub fn extract_from_node<'a>(node: &'a Node, links: &mut HashSet<&'a String>) {
+ let maybe_link = match node {
+ Node::Image(img) => Some(&img.url),
+ Node::Link(link) => Some(&link.url),
+ Node::MdxJsxTextElement(el) => extract_from_jsx_text_element(el),
+ _ => None,
+ };
+
+ if let Some(link) = maybe_link {
+ links.insert(link);
+ }
+}
+
+fn extract_from_jsx_text_element(el: &MdxJsxTextElement) -> Option<&String> {
+ let Some(Some(href_attr)) = el.attributes.iter().find_map(|attr| match attr {
+ AttributeContent::Property(p) if p.name == "href" => Some(&p.value),
+ _ => None,
+ }) else {
+ return None;
+ };
+ match href_attr {
+ AttributeValue::Literal(s) => Some(s),
+ _ => None,
+ }
+}
diff --git a/src/links/mod.rs b/src/links/mod.rs
deleted file mode 100644
index 351e776..0000000
--- a/src/links/mod.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-use markdown::mdast::{AttributeContent, AttributeValue, MdxJsxTextElement};
-use markdown::{mdast::Node, to_mdast, Constructs, ParseOptions};
-use napi::Error;
-use std::collections::HashSet;
-
-pub fn extract_links(markdown: String) -> Result<Vec<String>, Error> {
- let options = ParseOptions {
- constructs: Constructs {
- gfm_autolink_literal: true,
- gfm_footnote_definition: true,
- gfm_label_start_footnote: true,
- gfm_strikethrough: true,
- gfm_table: true,
- gfm_task_list_item: true,
- math_flow: true,
- math_text: true,
- mdx_jsx_flow: true,
- mdx_jsx_text: true,
- ..Constructs::mdx()
- },
- ..ParseOptions::mdx()
- };
-
- let ast = match to_mdast(markdown.as_str(), &options) {
- Ok(ast) => ast,
- Err(m) => return Err(Error::from_reason(m.to_string())),
- };
-
- let mut links = HashSet::<&String>::default();
- extract_from_node(&ast, &mut links);
-
- Ok(links.into_iter().cloned().collect())
-}
-
-fn extract_from_node<'a>(node: &'a Node, links: &mut HashSet<&'a String>) {
- let maybe_link = match node {
- Node::Image(img) => Some(&img.url),
- Node::Link(link) => Some(&link.url),
- Node::MdxJsxTextElement(el) => extract_from_jsx_text_element(el),
- _ => None,
- };
-
- if let Some(link) = maybe_link {
- links.insert(link);
- }
-
- if let Some(children) = node.children() {
- for child in children {
- extract_from_node(child, links);
- }
- }
-}
-
-fn extract_from_jsx_text_element(el: &MdxJsxTextElement) -> Option<&String> {
- let Some(Some(href_attr)) = el.attributes.iter().find_map(|attr| match attr {
- AttributeContent::Property(p) if p.name == "href" => Some(&p.value),
- _ => None,
- }) else {
- return None;
- };
- match href_attr {
- AttributeValue::Literal(s) => Some(s),
- _ => None,
- }
-}
diff --git a/src/mdx.rs b/src/mdx.rs
new file mode 100644
index 0000000..82a3762
--- /dev/null
+++ b/src/mdx.rs
@@ -0,0 +1,34 @@
+use markdown::{mdast::Node, to_mdast, Constructs, ParseOptions};
+use napi::Error;
+
+pub fn parse_mdx(source: String) -> Result<Node, Error> {
+ let options = ParseOptions {
+ constructs: Constructs {
+ gfm_autolink_literal: true,
+ gfm_footnote_definition: true,
+ gfm_label_start_footnote: true,
+ gfm_strikethrough: true,
+ gfm_table: true,
+ gfm_task_list_item: true,
+ math_flow: true,
+ math_text: true,
+ mdx_jsx_flow: true,
+ mdx_jsx_text: true,
+ ..Constructs::mdx()
+ },
+ ..ParseOptions::mdx()
+ };
+
+ to_mdast(source.as_str(), &options).map_err(|message| Error::from_reason(message.to_string()))
+}
+
+/// Walk the markdown AST and call a function on each node
+pub fn walk_ast<'a>(node: &'a Node, f: &mut impl FnMut(&'a Node) -> ()) -> () {
+ f(node);
+
+ if let Some(children) = node.children() {
+ for child in children {
+ walk_ast(child, f);
+ }
+ }
+}
diff --git a/src/notebook/mod.rs b/src/notebook.rs
similarity index 100%
rename from src/notebook/mod.rs
rename to src/notebook.rs