Add basic netlify redirects support

untitaker · untitaker · commit 4931ae5d68da · 2025-11-23T17:33:11.000+01:00
diff --git a/README.md b/README.md
@@ -184,6 +184,36 @@ and `--github-actions` feature.
   fairly feature-rich, but was a non-starter due to performance. This applies
   to other countless link checkers we tried that are not mentioned here.
 
+## Redirects
+
+Since 0.1.45 `hyperlink` supports a very small subset of Netlify's `_redirects`
+file. Redirect sources count as valid link targets when validating `href`s, and
+local redirect targets are themselves checked for existence.
+
+At the root of your site, make a file `_redirects`:
+
+```
+# lines starting with # are ignored
+/old-url.html /new-url.html
+
+# on the next line, trailing data like the 301 status code is ignored
+/old-url2.html /new-url2.html  301
+
+# /old-url.html will become a valid link target
+# hyperlink will validate that /new-url.html exists.
+```
+
+The major things missing from the implementation are:
+
+* `hyperlink` completely ignores any status codes or country code conditions.
+  It only parses the `from` and `to` fields; the rest of the line is ignored.
+
+* "Splat sources" (`/articles/*`) and "splat targets" (`/posts/:splat`) are not
+  supported.
+
+* Generally speaking, `hyperlink` does not support "pretty URLs", i.e. one
+  cannot request `/mypage` and expect `mypage.html` to be loaded.
+
 ## Testimonials
 
 > We use Hyperlink to check for dead links on
diff --git a/src/html/mod.rs b/src/html/mod.rs
@@ -3,7 +3,7 @@ mod parser;
 use std::borrow::Cow;
 use std::fmt;
 use std::fs;
-use std::io::Read;
+use std::io::{BufRead, BufReader, Read};
 use std::path::{Path, PathBuf};
 use std::str;
 use std::sync::Arc;
@@ -308,6 +308,41 @@ impl Document {
         Href(href.into_bump_str())
     }
 
+    /// Extracts the links of this document and passes each one to `callback`.
+    ///
+    /// A file named `_redirects` is parsed as a redirects file, and files with
+    /// an `html` or `htm` extension are parsed as HTML.
+    ///
+    /// Returns `Ok(true)` if the document was parsed and `Ok(false)` if it is
+    /// neither of those file types, in which case `callback` is never invoked.
+    ///
+    /// Errors from the underlying parser are returned unchanged.
+    pub fn extract_links<'b, 'l, P: ParagraphWalker, F>(
+        &self,
+        doc_buf: &'b mut DocumentBuffers,
+        check_anchors: bool,
+        callback: F,
+    ) -> Result<bool, Error>
+    where
+        'b: 'l,
+        F: FnMut(Link<'l, P::Paragraph>),
+    {
+        let file_name = self.path.file_name().and_then(|name| name.to_str());
+        let extension = self.path.extension().and_then(|ext| ext.to_str());
+
+        if file_name == Some("_redirects") {
+            // The redirects file is recognised by its exact file name.
+            self.parse_redirects::<P>(doc_buf, check_anchors)?.for_each(callback);
+            Ok(true)
+        } else if extension == Some("html") || extension == Some("htm") {
+            // Everything else is only parsed when it has an HTML extension.
+            self.links_from_html::<P>(doc_buf, check_anchors)?.for_each(callback);
+            Ok(true)
+        } else {
+            Ok(false)
+        }
+    }
+
     pub fn links<'b, 'l, P: ParagraphWalker>(
         &self,
         doc_buf: &'b mut DocumentBuffers,
@@ -319,6 +354,62 @@ impl Document {
         self.links_from_read::<_, P>(doc_buf, fs::File::open(&*self.path)?, check_anchors)
     }
 
+    /// Opens the file at `self.path` and returns the links that
+    /// `links_from_read` extracts from its contents.
+    fn links_from_html<'b: 'l, 'l, P: ParagraphWalker>(
+        &self,
+        doc_buf: &'b mut DocumentBuffers,
+        check_anchors: bool,
+    ) -> Result<impl Iterator<Item = Link<'l, P::Paragraph>>, Error> {
+        let html_file = fs::File::open(&*self.path)?;
+        self.links_from_read::<_, P>(doc_buf, html_file, check_anchors)
+    }
+
+    /// Parses this document as a Netlify-style `_redirects` file.
+    ///
+    /// Blank lines and lines starting with `#` are skipped. On every other line
+    /// the first two whitespace-separated fields are read as `from to`; extra
+    /// fields are ignored, and so are lines with fewer than two fields.
+    fn parse_redirects<'b: 'l, 'l, P: ParagraphWalker>(
+        &self,
+        doc_buf: &'b mut DocumentBuffers,
+        check_anchors: bool,
+    ) -> Result<impl Iterator<Item = Link<'l, P::Paragraph>>, Error> {
+        let arena = &doc_buf.arena;
+        let mut links = BumpVec::new_in(arena);
+        let redirects = BufReader::new(fs::File::open(&*self.path)?);
+
+        for line in redirects.lines() {
+            let line = line?;
+            let rule = line.trim();
+            if rule.is_empty() || rule.starts_with('#') {
+                continue;
+            }
+
+            let mut fields = rule.split_whitespace();
+            if let (Some(from), Some(to)) = (fields.next(), fields.next()) {
+                let from_href = arena.alloc_str(from);
+                let to_href = arena.alloc_str(to);
+                // The redirect source is itself a valid link target.
+                let href = self.join(arena, check_anchors, from_href);
+                links.push(Link::Defines(DefinedLink { href }));
+
+                // Only non-external redirect targets are emitted for checking.
+                if !is_external_link(to.as_bytes()) {
+                    let href = self.join(arena, check_anchors, to_href);
+                    let path = self.path.clone();
+                    links.push(Link::Uses(UsedLink {
+                        href,
+                        path,
+                        paragraph: None,
+                    }));
+                }
+            }
+        }
+
+        Ok(links.into_iter())
+    }
+
     fn links_from_read<'b, 'l, R: Read, P: ParagraphWalker>(
         &self,
         doc_buf: &'b mut DocumentBuffers,
diff --git a/src/main.rs b/src/main.rs
@@ -468,26 +468,17 @@ fn extract_html_links<C: LinkCollector<P::Paragraph>, P: ParagraphWalker>(
                 }));
                 file_count += 1;
 
-                if !document
-                    .path
-                    .extension()
-                    .and_then(|extension| Some(HTML_FILES.contains(&extension.to_str()?)))
-                    .unwrap_or(false)
-                {
-                    return Ok((doc_buf, collector, documents_count, file_count));
+                let was_parsed = document
+                    .extract_links::<P, _>(&mut doc_buf, check_anchors, |link| {
+                        collector.ingest(link);
+                    })
+                    .with_context(|| format!("Failed to read file {}", document.path.display()))?;
+
+                if was_parsed {
+                    doc_buf.reset();
+                    documents_count += 1;
                 }
 
-                for link in document
-                    .links::<P>(&mut doc_buf, check_anchors)
-                    .with_context(|| format!("Failed to read file {}", document.path.display()))?
-                {
-                    collector.ingest(link);
-                }
-
-                doc_buf.reset();
-
-                documents_count += 1;
-
                 Ok((doc_buf, collector, documents_count, file_count))
             },
         )
diff --git a/tests/cli.rs b/tests/cli.rs
@@ -65,3 +65,44 @@ fn test_bad_dir() {
             "Error: IO error for operation on non_existing_dir:",
         ));
 }
+
+#[test]
+fn test_redirects() {
+    // `/broken` redirects to a page that is never created below, while the
+    // targets of `/old-page` and `/another` do exist.
+    let redirects = [
+        "# This is a comment",
+        "",
+        "/old-page /new-page.html 301",
+        "/external https://example.com/page",
+        "/broken /missing-page.html",
+        "/another /target.html",
+    ]
+    .join("\n");
+
+    let tmp = assert_fs::TempDir::new().unwrap();
+    tmp.child("_redirects").write_str(&redirects).unwrap();
+    tmp.child("new-page.html").touch().unwrap();
+    tmp.child("target.html").touch().unwrap();
+    tmp.child("index.html")
+        .write_str("<a href='/old-page'>link</a>")
+        .unwrap();
+
+    // Exactly one bad link is expected, reported against the redirects file.
+    let expected_stdout = predicate::str::is_match(
+        r#"^Reading files
+Checking 4 links from 4 files \(4 documents\)
+\./.*_redirects
+  error: bad link /missing-page\.html
+
+Found 1 bad links
+"#,
+    )
+    .unwrap();
+
+    let mut hyperlink = Command::cargo_bin("hyperlink").unwrap();
+    hyperlink.current_dir(tmp.path()).arg(".");
+    hyperlink.assert().failure().code(1).stdout(expected_stdout);
+
+    tmp.close().unwrap();
+}