
Commit c542c8f (parent 94f2074)

feat: add corpus normalization feature

File tree: 9 files changed, +1082 −43 lines


.gitignore

Lines changed: 3 additions & 1 deletion

@@ -1 +1,3 @@
-target
+target/
+personal_docs/
+data/

Cargo.lock

Lines changed: 3 additions & 0 deletions

Generated file; diff not rendered.

Cargo.toml

Lines changed: 6 additions & 3 deletions

@@ -5,7 +5,7 @@ edition = "2021"

 [features]
 default = []
-multi_thread = ["scraper"]
+multi_thread = []

 [dependencies]
 heapless = { version = "0.9.1", default-features = false }
@@ -25,7 +25,10 @@ reqwest = { version = "0.12", default-features = false, features = [
     "stream",
 ] }
 lol_html = "2.7.0"
-scraper = { version = "0.24.0", optional = true }
+scraper = { version = "0.24.0" }
+crc32fast = "1.4"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
 clap = { version = "4.5", features = ["derive", "env"] }
-url = "2.5"
+url = { version = "2.5", features = ["serde"] }
 clap_builder = "4.5.51"
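The new `crc32fast`, `serde`, and `serde_json` dependencies line up with the manifest checksums and JSONL output introduced in the README below. Presumably the checksums come from crc32fast's one-shot hasher; a minimal sketch of that API (whether the normalizer computes its `checksum` field exactly this way is an assumption):

```rust
/// CRC32 of a page body via crc32fast. That this matches how the
/// normalizer derives its manifest `checksum` is an assumption.
fn body_checksum(bytes: &[u8]) -> u32 {
    let mut hasher = crc32fast::Hasher::new();
    hasher.update(bytes);
    hasher.finalize()
}
```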

README.md

Lines changed: 23 additions & 1 deletion

@@ -101,12 +101,34 @@ cargo run --example wiki --features multi_thread -- --duration-secs 60
 - Metrics live in `src/runtime.rs` and can be extended if you need additional counters or telemetry sinks. Multi-thread
   runs also report `local shard enqueues` vs `remote shard links (batches)` so you can gauge partition efficiency.

+## Corpus Normalization
+
+Pass `--normalize` to stream every fetched page through the new `Normalizer` service. The pipeline writes newline-
+delimited JSON (metadata + cleaned text blocks + embedding-ready chunks) to `--normalize-jsonl` (default:
+`normalized_pages.jsonl`) and respects additional knobs:
+
+```
+cargo run --example wiki -- \
+  --normalize \
+  --normalize-jsonl data/wiki.jsonl \
+  --normalize-manifest-jsonl data/wiki_manifest.jsonl \
+  --normalize-chunk-tokens 384 \
+  --normalize-overlap-tokens 64
+```
+
+Chunk and block bounds can be tuned via `--normalize-chunk-tokens`, `--normalize-overlap-tokens`, and
+`--normalize-max-blocks`. The JSON payload includes per-block heading context, content hashes, token estimates, and
+metadata such as HTTP status, language hints, and shard ownership so downstream embedding/indexing jobs can ingest it
+directly. When `--normalize-manifest-jsonl` is set, the runtime also appends digest records (`url`, `checksum`,
+`last_seen_epoch_ms`, `changed`) so incremental pipelines can diff and skip re-embedding unchanged pages.
+
 ## LLM-Oriented Next Steps

 Fastcrawl is already a solid content harvester for downstream ML pipelines. Future work aimed at LLM/RAG workflows
 includes:

-1. **Corpus normalization** – strip boilerplate, capture metadata, and chunk pages into consistent token windows.
+- [x] **Corpus normalization** – strip boilerplate, capture metadata, and chunk pages into consistent token windows.
+
 2. **Embedding pipeline** – push cleaned chunks through an embedding model and store vectors (pgvector/Qdrant/Milvus)
    with provenance.
 3. **Incremental refresh** – schedule revisits, diff pages, and update embeddings so the knowledge base stays current.
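The manifest fields named in the README are enough to sketch an incremental consumer. The record struct below is an assumption (the authoritative shape lives in `src/normalizer.rs`, which this excerpt doesn't show, and the checksum could just as well serialize as a hex string); it only demonstrates diffing on the `changed` flag:

```rust
use serde::Deserialize;
use std::fs::File;
use std::io::{BufRead, BufReader};

/// Manifest digest record, reconstructed from the field names in the
/// README. Field types are assumptions of this sketch.
#[derive(Debug, Deserialize)]
struct ManifestRecord {
    url: String,
    checksum: u32,
    last_seen_epoch_ms: u64,
    changed: bool,
}

/// Collect URLs whose content changed since the last crawl, so an
/// embedding job can skip everything else.
fn urls_to_reembed(path: &str) -> std::io::Result<Vec<String>> {
    let reader = BufReader::new(File::open(path)?);
    let mut urls = Vec::new();
    for line in reader.lines() {
        // Each manifest line is one standalone JSON object (JSONL).
        if let Ok(record) = serde_json::from_str::<ManifestRecord>(&line?) {
            if record.changed {
                urls.push(record.url);
            }
        }
    }
    Ok(urls)
}
```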

src/controls.rs

Lines changed: 64 additions & 0 deletions

@@ -1,8 +1,10 @@
 //! Crawl throttle and filtering controls shared across executors.

+use crate::normalizer::NormalizationConfig;
 use clap::Parser;
 #[cfg(feature = "multi_thread")]
 use clap::ValueEnum;
+use std::path::PathBuf;
 use std::time::Duration;

 /// Tunable knobs that bound crawl behavior.
@@ -91,6 +93,34 @@ pub struct Cli {
     #[arg(long, env = "FASTCRAWL_DOMAINS", default_value = "en.wikipedia.org")]
     pub allowed_domains: String,

+    /// Enable corpus normalization pipeline and JSONL output
+    #[arg(long, env = "FASTCRAWL_NORMALIZE", default_value_t = false)]
+    pub normalize: bool,
+
+    /// Output path for normalized JSONL batches (overwrites existing file)
+    #[arg(
+        long,
+        env = "FASTCRAWL_NORMALIZE_JSONL",
+        default_value = "normalized_pages.jsonl"
+    )]
+    pub normalize_jsonl: String,
+
+    /// Optional manifest JSONL capturing per-URL digests (checksum + last seen)
+    #[arg(long, env = "FASTCRAWL_NORMALIZE_MANIFEST")]
+    pub normalize_manifest_jsonl: Option<PathBuf>,
+
+    /// Target tokens per chunk emitted by the normalizer
+    #[arg(long, env = "FASTCRAWL_NORMALIZE_TOKENS", default_value_t = 256)]
+    pub normalize_chunk_tokens: usize,
+
+    /// Token overlap between neighboring chunks
+    #[arg(long, env = "FASTCRAWL_NORMALIZE_OVERLAP", default_value_t = 48)]
+    pub normalize_overlap_tokens: usize,
+
+    /// Maximum text blocks to keep before truncating normalization
+    #[arg(long, env = "FASTCRAWL_NORMALIZE_MAX_BLOCKS", default_value_t = 8192)]
+    pub normalize_max_blocks: usize,
+
     /// Shard partitioning strategy (multi-thread feature only)
     #[cfg(feature = "multi_thread")]
     #[arg(long, env = "FASTCRAWL_PARTITION", default_value = "hash")]
@@ -133,6 +163,29 @@ impl Cli {
         Duration::from_secs(self.duration_secs)
     }

+    /// Returns normalization settings when enabled.
+    pub fn normalization_settings(&self) -> Option<NormalizationSettings> {
+        if !self.normalize {
+            return None;
+        }
+
+        let chunk_target = self.normalize_chunk_tokens.max(1);
+        let chunk_overlap = self
+            .normalize_overlap_tokens
+            .min(chunk_target.saturating_sub(1));
+        let max_blocks = self.normalize_max_blocks.max(1);
+
+        Some(NormalizationSettings {
+            output_path: PathBuf::from(&self.normalize_jsonl),
+            manifest_path: self.normalize_manifest_jsonl.clone(),
+            config: NormalizationConfig {
+                chunk_target_tokens: chunk_target,
+                chunk_overlap_tokens: chunk_overlap,
+                max_blocks,
+            },
+        })
+    }
+
     fn domains_vec(&self) -> Vec<String> {
         self.allowed_domains
             .split(',')
@@ -179,3 +232,14 @@ pub struct PartitionSettings {
     /// Whether to log channel-closed warnings for cross-shard sends.
     pub remote_channel_logs: bool,
 }
+
+/// Settings controlling corpus normalization outputs.
+#[derive(Debug, Clone)]
+pub struct NormalizationSettings {
+    /// Filesystem path that will receive newline-delimited JSON.
+    pub output_path: PathBuf,
+    /// Optional path for digest manifest records.
+    pub manifest_path: Option<PathBuf>,
+    /// Chunking/cleanup configuration applied to each page.
+    pub config: NormalizationConfig,
+}
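A usage sketch for the new accessor (the crate name `fastcrawl` is assumed from the README): `normalization_settings()` returns `None` unless `--normalize` is set, and clamps the knobs so the overlap can never reach the chunk target.

```rust
use clap::Parser;
use fastcrawl::Cli;

fn main() {
    let cli = Cli::parse();

    // `None` unless --normalize was passed; overlap is clamped to
    // chunk_target - 1, and chunk/block bounds are forced to be >= 1.
    if let Some(settings) = cli.normalization_settings() {
        println!(
            "normalizing into {} ({} tokens/chunk, {} overlap, {} max blocks)",
            settings.output_path.display(),
            settings.config.chunk_target_tokens,
            settings.config.chunk_overlap_tokens,
            settings.config.max_blocks,
        );
    }
}
```

For example, `--normalize-chunk-tokens 256 --normalize-overlap-tokens 512` yields an overlap of 255, not 512, so a chunk always advances by at least one token.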

src/html.rs

Lines changed: 25 additions & 5 deletions

@@ -7,21 +7,34 @@ use std::error::Error;
 use std::fmt;
 use std::sync::{Arc, Mutex};

+/// Result of streaming link extraction.
+pub struct LinkHarvest<T> {
+    /// Accepted links transformed by the caller.
+    pub links: Vec<T>,
+    /// Optional raw body bytes captured during streaming.
+    pub body: Option<Vec<u8>>,
+}
+
 /// Streams anchor tags from an HTTP response, transforming matching `href` values with `transform`.
 ///
 /// The `transform` closure runs for every `href`; returning `Some(T)` keeps the value, `None` skips
-/// it. Only accepted entries count against `limit`.
+/// it. Only accepted entries count against `limit`. When `capture_body` is true the full response
+/// body is buffered and returned alongside the discovered links.
 pub async fn stream_links<T, F>(
     response: Response,
     limit: usize,
+    capture_body: bool,
     transform: F,
-) -> Result<Vec<T>, HtmlStreamError>
+) -> Result<LinkHarvest<T>, HtmlStreamError>
 where
     T: Send + 'static,
     F: Fn(&str) -> Option<T> + Send + Sync + 'static,
 {
-    if limit == 0 {
-        return Ok(Vec::new());
+    if limit == 0 && !capture_body {
+        return Ok(LinkHarvest {
+            links: Vec::new(),
+            body: None,
+        });
     }

     let values: Arc<Mutex<Vec<T>>> = Arc::new(Mutex::new(Vec::new()));
@@ -54,8 +67,12 @@ where
     );

     let mut stream = response.bytes_stream();
+    let mut body_buf = capture_body.then(Vec::new);
     while let Some(chunk) = stream.next().await {
         let chunk = chunk.map_err(HtmlStreamError::Http)?;
+        if let Some(buf) = body_buf.as_mut() {
+            buf.extend_from_slice(&chunk);
+        }
         rewriter.write(&chunk).map_err(HtmlStreamError::Rewrite)?;
     }
     rewriter.end().map_err(HtmlStreamError::Rewrite)?;
@@ -67,7 +84,10 @@ where
         .into_inner()
         .map_err(|_| HtmlStreamError::CollectorPoisoned)?;

-    Ok(collected)
+    Ok(LinkHarvest {
+        links: collected,
+        body: body_buf,
+    })
 }

 /// Errors surfaced while streaming HTML.
/// Errors surfaced while streaming HTML.

src/lib.rs

Lines changed: 5 additions & 0 deletions

@@ -6,9 +6,14 @@ mod bloom;
 pub mod controls;
 pub mod frontier;
 pub mod html;
+pub mod normalizer;
 pub mod runtime;

 pub use agents::{registry, AgentRegistry, CrawlTask, InlineString};
 pub use controls::{Cli, CrawlControls};
 pub use frontier::{Frontier, FrontierError, DEFAULT_FRONTIER_QUEUE, DEFAULT_FRONTIER_SEEN};
+pub use normalizer::{
+    BlockKind, FetchedPage, NormalizationConfig, NormalizationError, NormalizedChunk,
+    NormalizedPage, Normalizer, PageMetadata, SectionHeading, TextBlock,
+};
 pub use runtime::run as run_crawler;
