diff --git a/Cargo.toml b/Cargo.toml index 12cea95..748dee2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,6 @@ version = "0.1.0" crate-type = ["cdylib"] [dependencies] -fancy-regex = "0.17.0" markdown = "1.0.0" napi = { version = "3.0.0", features = ["async"] } napi-derive = "3.0.0" diff --git a/README.md b/README.md index 95635db..9fb0e9d 100644 --- a/README.md +++ b/README.md @@ -49,5 +49,4 @@ Follow these steps for a foolproof way to make a new release: 3. Run `gh pr create` and accept the defaults. This will create a PR with the correct title. Submit the PR. 4. Once CI has passed and been approved, merge the PR. This will trigger a release. - > WARN: Don't run `npm publish` manually. diff --git a/__test__/index.spec.ts b/__test__/index.spec.ts index 2a549cb..a26ffbe 100644 --- a/__test__/index.spec.ts +++ b/__test__/index.spec.ts @@ -167,8 +167,26 @@ test('extractAnchors: duplicate headings', (t) => { ) }) -test('extractAnchors: markdown in headings', (t) => { - t.deepEqual(extractAnchors('# My **heading**'), ['#my-**heading**']) +test('extractAnchors: bold text in headings', (t) => { + t.deepEqual(extractAnchors('# My **heading**'), ['#my-heading']) +}) + +test('extractAnchors: code in headings', (t) => { + t.deepEqual(extractAnchors('# My `heading`'), ['#my-heading']) +}) + +test('extractAnchors: math in headings', (t) => { + t.deepEqual(extractAnchors('## Gates $\\rightarrow$ quantum gates'), ['#gates-rightarrow-quantum-gates']) + t.deepEqual( + extractAnchors( + '### Template circuits for calculating matrix elements of $\\tilde{S}$ and $\\tilde{H}$ via Hadamard test', + ), + ['#template-circuits-for-calculating-matrix-elements-of-tildes-and-tildeh-via-hadamard-test'], + ) +}) + +test('extractAnchors: mdx in headings', (t) => { + t.deepEqual(extractAnchors('# My heading`'), ['#my-heading']) }) test('extractAnchors: forbidden characters', (t) => { @@ -176,13 +194,15 @@ test('extractAnchors: forbidden characters', (t) => { }) test('extractAnchors: id 
tags', (t) => { - t.deepEqual(extractAnchors('<span id="thing">'), ['#thing']) + t.deepEqual(extractAnchors('<span id="thing" />'), ['#thing']) }) test('extractAnchors: duplicate id tags', (t) => { - t.deepEqual(extractAnchors('<span id="thing">\n\n<span id="thing">'), ['#thing']) + t.deepEqual(extractAnchors('<span id="thing" />\n\n<span id="thing" />'), ['#thing']) }) test('extractAnchors: headings with links', (t) => { - t.deepEqual(extractAnchors('# My [heading](/test1) with [multiple links](/test2)'), ['#my-heading-with-multiple-links']) + t.deepEqual(extractAnchors('# My [heading](/test1) with [multiple links](/test2)'), [ + '#my-heading-with-multiple-links', + ]) }) diff --git a/src/anchors.rs b/src/anchors.rs new file mode 100644 index 0000000..06033e1 --- /dev/null +++ b/src/anchors.rs @@ -0,0 +1,105 @@ +use markdown::mdast::{AttributeContent, AttributeValue, Heading, Node}; +use std::collections::HashMap; + +/// If `node` is a heading or mdx element with `id` prop, extract the heading +/// text and add it to `anchor_occurrences` +pub fn extract_from_node(node: &Node, anchor_occurrences: &mut HashMap<String, i32>) { + match node { + Node::Heading(h) => { + let anchor = anchor_from_heading(h); + let existing_duplicates = anchor_occurrences.get(&anchor).unwrap_or(&0); + anchor_occurrences.insert(anchor, *existing_duplicates + 1); + } + Node::MdxJsxFlowElement(el) => { + if let Some(anchor) = get_id_prop(&el.attributes) { + anchor_occurrences.insert(anchor, 1); + }; + } + Node::MdxJsxTextElement(el) => { + if let Some(anchor) = get_id_prop(&el.attributes) { + anchor_occurrences.insert(anchor, 1); + }; + } + _ => (), + }; +} + +pub fn deduplicate_anchors(anchor_occurrences: HashMap<String, i32>) -> Vec<String> { + anchor_occurrences + .into_iter() + .flat_map(|(anchor, duplications)| { + (0..duplications).map(move |n| match n { + 0 => format!("#{anchor}"), + n => format!("#{anchor}-{n}"), + }) + }) + .collect() +} + +fn get_id_prop(attributes: &Vec<AttributeContent>) -> Option<String> { + for attr in attributes.iter() { + let AttributeContent::Property(prop) = attr else { + continue; + }; + if prop.name != "id" { + continue; 
+ }; + if let Some(AttributeValue::Literal(text)) = prop.value.clone() { + return Some(text); + } + } + return None; +} + +fn anchor_from_heading(heading: &Heading) -> String { + let mut text = String::with_capacity(100); + for child in heading.children.iter() { + get_text(child, &mut text); + } + heading_to_anchor(text) +} + +/// Get plain text from a node and all its children +pub fn get_text<'a>(node: &'a Node, s: &mut String) { + let maybe_text = match node { + Node::Text(text) => Some(&text.value), + Node::InlineCode(text) => Some(&text.value), + Node::InlineMath(text) => Some(&text.value), + _ => None, + }; + if let Some(text) = maybe_text { + s.push_str(text.as_str()) + }; + + if let Some(children) = node.children() { + for child in children { + get_text(child, s); + } + } +} + +fn heading_to_anchor(heading: String) -> String { + heading + .trim() + .to_lowercase() + .chars() + .filter_map(|c| match c { + ' ' => Some('-'), + '.' => None, + ',' => None, + ';' => None, + ':' => None, + '!' => None, + '?' 
=> None, + '`' => None, + '\\' => None, + '(' => None, + ')' => None, + '"' => None, + '\'' => None, + '{' => None, + '}' => None, + x => Some(x), + }) + .collect() +} diff --git a/src/anchors/mod.rs b/src/anchors/mod.rs deleted file mode 100644 index 483f575..0000000 --- a/src/anchors/mod.rs +++ /dev/null @@ -1,72 +0,0 @@ -use fancy_regex::Regex; -use std::collections::HashMap; - -pub fn extract_anchors_from_ref(markdown: &str) -> Vec<String> { - let heading_regex = Regex::new("^\\s*#{1,6}\\s+(.+?)\\s*$").unwrap(); - let id_regex = Regex::new("(?<=id=\")(.+?)(?=\")").unwrap(); - - let mut anchor_occurrences = HashMap::<String, i32>::default(); - for line in markdown.split("\n") { - if let Some(heading) = get_first_capture(line, &heading_regex) { - let anchor = heading_to_anchor(heading); - let existing_duplicates = anchor_occurrences.get(&anchor).unwrap_or(&0); - anchor_occurrences.insert(anchor, *existing_duplicates + 1); - } - if let Some(id) = get_first_capture(line, &id_regex) { - if !anchor_occurrences.contains_key(id) { - anchor_occurrences.insert(id.to_string(), 1); - } - } - } - - anchor_occurrences - .into_iter() - .flat_map(|(anchor, duplications)| { - (0..duplications).map(move |n| match n { - 0 => format!("#{anchor}"), - n => format!("#{anchor}-{n}"), - }) - }) - .collect() -} - -fn heading_to_anchor(heading: &str) -> String { - let heading_without_links = remove_markdown_links(heading); - heading_without_links - .trim() - .to_lowercase() - .chars() - .filter_map(|c| match c { - ' ' => Some('-'), - '.' => None, - ',' => None, - ';' => None, - ':' => None, - '!' => None, - '?' => None, - '`' => None, - '\\' => None, - '(' => None, - ')' => None, - '"' => None, - '\'' => None, - x => Some(x), - }) - .collect() -} - -fn get_first_capture<'a>(s: &'a str, r: &Regex) -> Option<&'a str> { - let Ok(Some(captures)) = r.captures(s) else { - return None; - }; - Some(captures.get(1)?.as_str()) -} - -/// Extracts the text inside every markdown link found in `markdown`. 
-/// -/// Example: -/// "My [heading with links](/test)" -> "My heading with links" -fn remove_markdown_links(markdown: &str) -> String { - let re = Regex::new(r"\[([^\[\]]+)\]\(([^)]+)\)").unwrap(); - re.replace_all(markdown, "$1").to_string() -} diff --git a/src/lib.rs b/src/lib.rs index b1a06e8..ceef312 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,12 +1,14 @@ +use std::collections::{HashMap, HashSet}; + use napi::Error; use napi_derive::napi; use tokio::fs; -use crate::anchors::extract_anchors_from_ref; use crate::notebook::extract_markdown_from_notebook_source; mod anchors; mod links; +mod mdx; mod notebook; fn file_read_error(path: String, reason: String) -> Error { @@ -37,24 +39,42 @@ pub async fn extract_from_file(file_path: String) -> Result<Vec<Vec<String>>, Er source }; - let anchors = extract_anchors_from_ref(&markdown); - match extract_links(markdown) { - Ok(links) => Ok(vec![links, anchors]), - Err(e) => Err(Error::from_reason(e.to_string())), - } + let ast_root = mdx::parse_mdx(markdown)?; + + let mut link_set = HashSet::<&String>::default(); + let mut anchor_occurrences = HashMap::<String, i32>::default(); + mdx::walk_ast(&ast_root, &mut |node| { + links::extract_from_node(node, &mut link_set); + anchors::extract_from_node(node, &mut anchor_occurrences); + }); + + Ok(vec![ + link_set.into_iter().cloned().collect(), + anchors::deduplicate_anchors(anchor_occurrences), + ]) } /// Extract anchors from a markdown string. Anchors are either: /// * slugified headings, deduplicated if the same heading appears more than once /// * `id` props of HTML tags. 
These are not deduplicated as they should be unique per file #[napi] -pub fn extract_anchors(markdown: String) -> Vec<String> { - extract_anchors_from_ref(&markdown) +pub fn extract_anchors(markdown: String) -> Result<Vec<String>, Error> { + let ast_root = mdx::parse_mdx(markdown)?; + let mut anchor_occurrences = HashMap::<String, i32>::default(); + mdx::walk_ast(&ast_root, &mut |node| { + anchors::extract_from_node(node, &mut anchor_occurrences) + }); + Ok(anchors::deduplicate_anchors(anchor_occurrences)) } /// Extract links from a markdown string. Supports GitHub-flavored markdown /// (gfm), math, and JSX. #[napi] pub fn extract_links(markdown: String) -> Result<Vec<String>, Error> { - links::extract_links(markdown) + let ast_root = mdx::parse_mdx(markdown)?; + let mut links = HashSet::<&String>::default(); + mdx::walk_ast(&ast_root, &mut |node| { + links::extract_from_node(node, &mut links) + }); + Ok(links.into_iter().cloned().collect()) } diff --git a/src/links.rs b/src/links.rs new file mode 100644 index 0000000..dceaf3d --- /dev/null +++ b/src/links.rs @@ -0,0 +1,29 @@ +use markdown::mdast::Node; +use markdown::mdast::{AttributeContent, AttributeValue, MdxJsxTextElement}; +use std::collections::HashSet; + +pub fn extract_from_node<'a>(node: &'a Node, links: &mut HashSet<&'a String>) { + let maybe_link = match node { + Node::Image(img) => Some(&img.url), + Node::Link(link) => Some(&link.url), + Node::MdxJsxTextElement(el) => extract_from_jsx_text_element(el), + _ => None, + }; + + if let Some(link) = maybe_link { + links.insert(link); + } +} + +fn extract_from_jsx_text_element(el: &MdxJsxTextElement) -> Option<&String> { + let Some(Some(href_attr)) = el.attributes.iter().find_map(|attr| match attr { + AttributeContent::Property(p) if p.name == "href" => Some(&p.value), + _ => None, + }) else { + return None; + }; + match href_attr { + AttributeValue::Literal(s) => Some(s), + _ => None, + } +} diff --git a/src/links/mod.rs b/src/links/mod.rs deleted file mode 100644 index 351e776..0000000 --- 
a/src/links/mod.rs +++ /dev/null @@ -1,65 +0,0 @@ -use markdown::mdast::{AttributeContent, AttributeValue, MdxJsxTextElement}; -use markdown::{mdast::Node, to_mdast, Constructs, ParseOptions}; -use napi::Error; -use std::collections::HashSet; - -pub fn extract_links(markdown: String) -> Result<Vec<String>, Error> { - let options = ParseOptions { - constructs: Constructs { - gfm_autolink_literal: true, - gfm_footnote_definition: true, - gfm_label_start_footnote: true, - gfm_strikethrough: true, - gfm_table: true, - gfm_task_list_item: true, - math_flow: true, - math_text: true, - mdx_jsx_flow: true, - mdx_jsx_text: true, - ..Constructs::mdx() - }, - ..ParseOptions::mdx() - }; - - let ast = match to_mdast(markdown.as_str(), &options) { - Ok(ast) => ast, - Err(m) => return Err(Error::from_reason(m.to_string())), - }; - - let mut links = HashSet::<&String>::default(); - extract_from_node(&ast, &mut links); - - Ok(links.into_iter().cloned().collect()) -} - -fn extract_from_node<'a>(node: &'a Node, links: &mut HashSet<&'a String>) { - let maybe_link = match node { - Node::Image(img) => Some(&img.url), - Node::Link(link) => Some(&link.url), - Node::MdxJsxTextElement(el) => extract_from_jsx_text_element(el), - _ => None, - }; - - if let Some(link) = maybe_link { - links.insert(link); - } - - if let Some(children) = node.children() { - for child in children { - extract_from_node(child, links); - } - } -} - -fn extract_from_jsx_text_element(el: &MdxJsxTextElement) -> Option<&String> { - let Some(Some(href_attr)) = el.attributes.iter().find_map(|attr| match attr { - AttributeContent::Property(p) if p.name == "href" => Some(&p.value), - _ => None, - }) else { - return None; - }; - match href_attr { - AttributeValue::Literal(s) => Some(s), - _ => None, - } -} diff --git a/src/mdx.rs b/src/mdx.rs new file mode 100644 index 0000000..82a3762 --- /dev/null +++ b/src/mdx.rs @@ -0,0 +1,34 @@ +use markdown::{mdast::Node, to_mdast, Constructs, ParseOptions}; +use napi::Error; + +pub fn 
parse_mdx(source: String) -> Result<Node, Error> { + let options = ParseOptions { + constructs: Constructs { + gfm_autolink_literal: true, + gfm_footnote_definition: true, + gfm_label_start_footnote: true, + gfm_strikethrough: true, + gfm_table: true, + gfm_task_list_item: true, + math_flow: true, + math_text: true, + mdx_jsx_flow: true, + mdx_jsx_text: true, + ..Constructs::mdx() + }, + ..ParseOptions::mdx() + }; + + to_mdast(source.as_str(), &options).map_err(|message| Error::from_reason(message.to_string())) +} + +/// Walk the markdown AST and call a function on each node +pub fn walk_ast<'a>(node: &'a Node, f: &mut impl FnMut(&'a Node) -> ()) -> () { + f(node); + + if let Some(children) = node.children() { + for child in children { + walk_ast(child, f); + } + } +} diff --git a/src/notebook/mod.rs b/src/notebook.rs similarity index 100% rename from src/notebook/mod.rs rename to src/notebook.rs