Extract both links and anchors from file (#6)

frankharkins · web-flow · commit c8165bae1326 · 2026-01-14T18:17:47.000Z
* Extract both links and anchors from file

* Refactor: Move link extraction into its own `links` module
diff --git a/__test__/index.spec.ts b/__test__/index.spec.ts
@@ -1,6 +1,6 @@
 import test from 'ava'
 
-import { extractLinks, extractAnchors, extractLinksFromFile } from '../index'
+import { extractLinks, extractAnchors, extractFromFile } from '../index'
 
 const dedent = (s: string) => s.replace('\n    ', '')
 
@@ -109,20 +109,19 @@ test('extractLinks: appropriate jsx error message', (t) => {
   t.is(error.message, '1:13: Expected a closing tag for `<Admonition>` (1:1) (markdown-rs:end-tag-mismatch)')
 })
 
-test('extractLinksFromFile: mdx file', async (t) => {
-  const links = await extractLinksFromFile('__test__/fixtures/markdown.mdx')
-  t.deepEqual(links, ['/path'])
+test('extractFromFile: mdx file', async (t) => {
+  const result = await extractFromFile('__test__/fixtures/markdown.mdx')
+  t.deepEqual(result, [['/path'], ['#example-document']])
 })
 
-test('extractLinksFromFile: notebook', async (t) => {
-  const links = (await extractLinksFromFile('__test__/fixtures/markdown.ipynb')).sort()
-  t.deepEqual(links, ['/path', '/path2'].sort())
+test('extractFromFile: notebook', async (t) => {
+  const [links, anchors] = await extractFromFile('__test__/fixtures/markdown.ipynb')
+  t.deepEqual(links.sort(), ['/path', '/path2'])
+  t.deepEqual(anchors, ['#example-notebook'])
 })
 
-test('extractLinksFromFile: markdown file not found', async (t) => {
-  const error = await t.throwsAsync(
-    async () => await extractLinksFromFile('__test__/fixtures/file_that_does_not_exist.md'),
-  )
+test('extractFromFile: markdown file not found', async (t) => {
+  const error = await t.throwsAsync(async () => await extractFromFile('__test__/fixtures/file_that_does_not_exist.md'))
   t.is(error.name, 'Error')
 
   // The error message changes depending on OS, but both are acceptable
@@ -133,20 +132,18 @@ test('extractLinksFromFile: markdown file not found', async (t) => {
   t.assert(acceptableMessages.includes(error.message))
 })
 
-test('extractLinksFromFile: invalid notebook (not JSON)', async (t) => {
-  const error = await t.throwsAsync(
-    async () => await extractLinksFromFile('__test__/fixtures/invalid-notebook-json.ipynb'),
-  )
+test('extractFromFile: invalid notebook (not JSON)', async (t) => {
+  const error = await t.throwsAsync(async () => await extractFromFile('__test__/fixtures/invalid-notebook-json.ipynb'))
   t.is(error.name, 'Error')
   t.is(
     error.message,
     'Could not read "__test__/fixtures/invalid-notebook-json.ipynb": trailing comma at line 7 column 7',
   )
 })
 
-test('extractLinksFromFile: invalid notebook (bad schema)', async (t) => {
+test('extractFromFile: invalid notebook (bad schema)', async (t) => {
   const error = await t.throwsAsync(
-    async () => await extractLinksFromFile('__test__/fixtures/invalid-notebook-schema.ipynb'),
+    async () => await extractFromFile('__test__/fixtures/invalid-notebook-schema.ipynb'),
   )
   t.is(error.name, 'Error')
   t.is(
diff --git a/index.d.ts b/index.d.ts
@@ -7,10 +7,18 @@
  */
 export declare function extractAnchors(markdown: string): Array<string>
 
+/**
+ * Extracts links and anchors from an MDX file or notebook containing MDX.
+ *
+ * Example:
+ * ```ts
+ * const [links, anchors] = await extractFromFile("notebook.ipynb");
+ * ```
+ */
+export declare function extractFromFile(filePath: string): Promise<[string[], string[]]>
+
 /**
  * Extract links from a markdown string. Supports GitHub-flavored markdown
  * (gfm), math, and JSX.
  */
 export declare function extractLinks(markdown: string): Array<string>
-
-export declare function extractLinksFromFile(filePath: string): Promise<Array<string>>
diff --git a/index.js b/index.js
@@ -573,5 +573,5 @@ if (!nativeBinding) {
 
 module.exports = nativeBinding
 module.exports.extractAnchors = nativeBinding.extractAnchors
+module.exports.extractFromFile = nativeBinding.extractFromFile
 module.exports.extractLinks = nativeBinding.extractLinks
-module.exports.extractLinksFromFile = nativeBinding.extractLinksFromFile
diff --git a/src/lib.rs b/src/lib.rs
@@ -1,43 +1,49 @@
-use markdown::mdast::{AttributeContent, AttributeValue, MdxJsxTextElement};
-use markdown::{mdast::Node, to_mdast, Constructs, ParseOptions};
 use napi::Error;
 use napi_derive::napi;
-use std::collections::HashSet;
 use tokio::fs;
 
+use crate::anchors::extract_anchors_from_ref;
 use crate::notebook::extract_markdown_from_notebook_source;
 
+mod anchors;
+mod links;
 mod notebook;
 
-fn file_read_error(path: String, reason: String) -> Result<Vec<String>, Error> {
+fn file_read_error(path: String, reason: String) -> Error {
   let message = format!("Could not read \"{path}\": {reason}");
-  Err(Error::from_reason(message))
+  Error::from_reason(message)
 }
 
-#[napi]
-pub async fn extract_links_from_file(file_path: String) -> Result<Vec<String>, Error> {
+/// Extracts links and anchors from an MDX file or notebook containing MDX.
+///
+/// Example:
+/// ```ts
+/// const [links, anchors] = await extractFromFile("notebook.ipynb");
+/// ```
+#[napi(ts_return_type = "Promise<[string[], string[]]>")]
+pub async fn extract_from_file(file_path: String) -> Result<Vec<Vec<String>>, Error> {
   let is_notebook = file_path.ends_with(".ipynb");
   let source = match fs::read_to_string(&file_path).await {
     Ok(s) => s,
-    Err(e) => return file_read_error(file_path, e.to_string()),
+    Err(e) => return Err(file_read_error(file_path, e.to_string())),
   };
 
   let markdown = if is_notebook {
     match extract_markdown_from_notebook_source(source) {
       Ok(md) => md,
-      Err(e) => return file_read_error(file_path, e.to_string()),
+      Err(e) => return Err(file_read_error(file_path, e.to_string())),
     }
   } else {
     source
   };
 
-  extract_links(markdown)
+  let anchors = extract_anchors_from_ref(&markdown);
+  match extract_links(markdown) {
+    Ok(links) => Ok(vec![links, anchors]),
+    Err(e) => Err(Error::from_reason(e.to_string())),
+  }
 }
 
-use crate::anchors::extract_anchors_from_ref;
-
-mod anchors;
-
 /// Extract anchors from a markdown string. Anchors are either:
 ///  * slugified headings, deduplicated if the same heading appears more than once
 ///  * `id` props of HTML tags. These are not deduplicated as they should be unique per file
@@ -50,62 +56,5 @@ pub fn extract_anchors(markdown: String) -> Vec<String> {
 /// (gfm), math, and JSX.
 #[napi]
 pub fn extract_links(markdown: String) -> Result<Vec<String>, Error> {
-  let options = ParseOptions {
-    constructs: Constructs {
-      gfm_autolink_literal: true,
-      gfm_footnote_definition: true,
-      gfm_label_start_footnote: true,
-      gfm_strikethrough: true,
-      gfm_table: true,
-      gfm_task_list_item: true,
-      math_flow: true,
-      math_text: true,
-      mdx_jsx_flow: true,
-      mdx_jsx_text: true,
-      ..Constructs::mdx()
-    },
-    ..ParseOptions::mdx()
-  };
-
-  let ast = match to_mdast(markdown.as_str(), &options) {
-    Ok(ast) => ast,
-    Err(m) => return Err(Error::from_reason(m.to_string())),
-  };
-
-  let mut links = HashSet::<&String>::default();
-  extract_from_node(&ast, &mut links);
-
-  Ok(links.into_iter().cloned().collect())
-}
-
-fn extract_from_node<'a>(node: &'a Node, links: &mut HashSet<&'a String>) {
-  let maybe_link = match node {
-    Node::Image(img) => Some(&img.url),
-    Node::Link(link) => Some(&link.url),
-    Node::MdxJsxTextElement(el) => extract_from_jsx_text_element(el),
-    _ => None,
-  };
-
-  if let Some(link) = maybe_link {
-    links.insert(link);
-  }
-
-  if let Some(children) = node.children() {
-    for child in children {
-      extract_from_node(child, links);
-    }
-  }
-}
-
-fn extract_from_jsx_text_element(el: &MdxJsxTextElement) -> Option<&String> {
-  let Some(Some(href_attr)) = el.attributes.iter().find_map(|attr| match attr {
-    AttributeContent::Property(p) if p.name == "href" => Some(&p.value),
-    _ => None,
-  }) else {
-    return None;
-  };
-  match href_attr {
-    AttributeValue::Literal(s) => Some(s),
-    _ => None,
-  }
+  links::extract_links(markdown)
 }
diff --git a/src/links/mod.rs b/src/links/mod.rs
@@ -0,0 +1,65 @@
+use markdown::mdast::{AttributeContent, AttributeValue, MdxJsxTextElement};
+use markdown::{mdast::Node, to_mdast, Constructs, ParseOptions};
+use napi::Error;
+use std::collections::HashSet;
+
+pub fn extract_links(markdown: String) -> Result<Vec<String>, Error> {
+  let options = ParseOptions {
+    constructs: Constructs {
+      gfm_autolink_literal: true,
+      gfm_footnote_definition: true,
+      gfm_label_start_footnote: true,
+      gfm_strikethrough: true,
+      gfm_table: true,
+      gfm_task_list_item: true,
+      math_flow: true,
+      math_text: true,
+      mdx_jsx_flow: true,
+      mdx_jsx_text: true,
+      ..Constructs::mdx()
+    },
+    ..ParseOptions::mdx()
+  };
+
+  let ast = match to_mdast(markdown.as_str(), &options) {
+    Ok(ast) => ast,
+    Err(m) => return Err(Error::from_reason(m.to_string())),
+  };
+
+  let mut links = HashSet::<&String>::default();
+  extract_from_node(&ast, &mut links);
+
+  Ok(links.into_iter().cloned().collect())
+}
+
+fn extract_from_node<'a>(node: &'a Node, links: &mut HashSet<&'a String>) {
+  let maybe_link = match node {
+    Node::Image(img) => Some(&img.url),
+    Node::Link(link) => Some(&link.url),
+    Node::MdxJsxTextElement(el) => extract_from_jsx_text_element(el),
+    _ => None,
+  };
+
+  if let Some(link) = maybe_link {
+    links.insert(link);
+  }
+
+  if let Some(children) = node.children() {
+    for child in children {
+      extract_from_node(child, links);
+    }
+  }
+}
+
+fn extract_from_jsx_text_element(el: &MdxJsxTextElement) -> Option<&String> {
+  let Some(Some(href_attr)) = el.attributes.iter().find_map(|attr| match attr {
+    AttributeContent::Property(p) if p.name == "href" => Some(&p.value),
+    _ => None,
+  }) else {
+    return None;
+  };
+  match href_attr {
+    AttributeValue::Literal(s) => Some(s),
+    _ => None,
+  }
+}