Commit 5130bf3

feat: add docker-compose.yml; embed chunks in pgvector store
1 parent fede124 commit 5130bf3

7 files changed (+221 / -148 lines)

README.md

Lines changed: 21 additions & 2 deletions

@@ -175,18 +175,37 @@ false, so re-embedding only happens when the crawler observed fresh content.
 ## pgvector Store
 
 Ship the embeddings into Postgres with the bundled `fastcrawl-pgvector` binary. It ingests the JSONL produced above and
-upserts into a `vector` table (creating the `vector` extension/table automatically unless disabled):
+upserts into a `vector` table (creating the `vector` extension/table automatically unless disabled). The repo now ships
+a `docker-compose.yml` that launches a local Postgres instance with the `pgvector` extension preinstalled:
+
+```sh
+docker compose up -d pgvector
+```
+
+Once the container is healthy, point `DATABASE_URL` at it and run the loader:
+
+```fish
+set -gx DATABASE_URL postgres://postgres:postgres@localhost:5432/fastcrawl
+```
 
 ```sh
 export DATABASE_URL=postgres://postgres:postgres@localhost:5432/fastcrawl
+```
+
+```sh
+docker compose up -d
+
 cargo run --bin pgvector_store -- \
 --input data/wiki_embeddings.jsonl \
 --schema public \
 --table wiki_chunks \
 --batch-size 256 \
---upsert
+--upsert \
+--database-url=postgresql://postgres:postgres@localhost:5432
 ```
 
+Stop the container with `docker compose down` (pass `-v` to remove the persisted volume if you want a clean slate).
+
 Columns created by default:
 
 - `url TEXT`, `chunk_id BIGINT` primary key for provenance.
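
As a quick sanity check once the loader finishes, a minimal sketch along these lines (not part of this commit) can confirm rows actually landed. It assumes `tokio-postgres` as the client crate, the default `public.wiki_chunks` target from the README, and the `DATABASE_URL` shown above; the `embedding` column name mentioned in the comment is an assumption, since the column list is truncated in this hunk.

```rust
// Sanity-check sketch (not part of this commit): counts rows written by the
// pgvector loader and assumes tokio-postgres as the client crate plus the
// README defaults (schema `public`, table `wiki_chunks`).
use anyhow::Result;
use tokio_postgres::NoTls;

#[tokio::main]
async fn main() -> Result<()> {
    let url = std::env::var("DATABASE_URL")?;
    let (client, connection) = tokio_postgres::connect(&url, NoTls).await?;
    // The connection object drives the socket; run it in the background.
    tokio::spawn(async move {
        if let Err(err) = connection.await {
            eprintln!("connection error: {err}");
        }
    });

    let count: i64 = client
        .query_one("SELECT COUNT(*) FROM public.wiki_chunks", &[])
        .await?
        .get(0);
    println!("public.wiki_chunks holds {count} rows");

    // An actual similarity search would order by a pgvector distance operator,
    // e.g. `ORDER BY embedding <-> $1 LIMIT 5`; the column name `embedding` is
    // an assumption, not shown in this hunk.
    Ok(())
}
```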

docker-compose.yml

Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
+version: "3.9"
+
+services:
+  pgvector:
+    image: pgvector/pgvector:0.8.1-pg18-trixie
+    container_name: fastcrawl-pgvector
+    restart: unless-stopped
+    environment:
+      POSTGRES_DB: fastcrawl
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: postgres
+    ports:
+      - "5432:5432"
+    volumes:
+      - pgvector-data:/var/lib/postgresql
+    healthcheck:
+      test: ["CMD", "pg_isready", "-U", "postgres"]
+      interval: 5s
+      timeout: 3s
+      retries: 5
+
+volumes:
+  pgvector-data:
+    driver: local
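
Compose marks the service healthy once `pg_isready` succeeds, but the loader itself simply tries to connect when invoked. A small readiness probe along these lines (not part of this commit; it assumes `tokio-postgres` as the client crate and reuses the credentials from the compose file above) can verify both connectivity and that the image really ships the `vector` extension before running the loader.

```rust
// Readiness-probe sketch (not part of this commit): retries until the
// compose-managed Postgres accepts connections, then checks that the
// `vector` extension is available. Credentials mirror docker-compose.yml.
use std::time::Duration;

use anyhow::{bail, Result};
use tokio_postgres::NoTls;

#[tokio::main]
async fn main() -> Result<()> {
    let url = "postgres://postgres:postgres@localhost:5432/fastcrawl";
    for attempt in 1..=10 {
        match tokio_postgres::connect(url, NoTls).await {
            Ok((client, connection)) => {
                // Drive the connection in the background.
                tokio::spawn(connection);
                let row = client
                    .query_opt(
                        "SELECT default_version FROM pg_available_extensions WHERE name = 'vector'",
                        &[],
                    )
                    .await?;
                match row {
                    Some(row) => {
                        let version: String = row.get(0);
                        println!("pgvector {version} is available; safe to run the loader");
                        return Ok(());
                    }
                    None => bail!("Postgres is up but the vector extension is missing"),
                }
            }
            Err(err) => {
                eprintln!("attempt {attempt}: not ready yet ({err})");
                tokio::time::sleep(Duration::from_secs(2)).await;
            }
        }
    }
    bail!("Postgres never became ready");
}
```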

src/bin/embedder.rs

Lines changed: 1 addition & 146 deletions

@@ -8,12 +8,8 @@ use std::time::Duration;
 use anyhow::{anyhow, Context, Result};
 use clap::Parser;
 use crossbeam_channel::{bounded, Receiver, RecvTimeoutError, Sender, TryRecvError};
+use fastcrawl::embedder::openai::OpenAiEmbedder;
 use fastcrawl::{EmbeddedChunkRecord, ManifestRecord, NormalizedPage};
-use reqwest::blocking::Client;
-use reqwest::header::{HeaderMap, HeaderValue, AUTHORIZATION, CONTENT_TYPE};
-use reqwest::StatusCode;
-use serde::Deserialize;
-use serde::Serialize;
 
 #[derive(Parser, Debug)]
 #[command(
@@ -456,144 +452,3 @@ struct EmbeddingBatchResult {
 }
 
 type EmbeddingResult = Result<EmbeddingBatchResult>;
-
-#[derive(Clone)]
-struct OpenAiEmbedder {
-    client: Client,
-    endpoint: String,
-    model: String,
-    dimensions: Option<usize>,
-    max_retries: usize,
-    batch_size: usize,
-}
-
-impl OpenAiEmbedder {
-    fn new(
-        api_key: String,
-        base_url: String,
-        model: String,
-        dimensions: Option<usize>,
-        timeout: Duration,
-        max_retries: usize,
-        batch_size: usize,
-    ) -> Result<Self> {
-        anyhow::ensure!(!api_key.trim().is_empty(), "missing OpenAI API key");
-        anyhow::ensure!(!model.trim().is_empty(), "missing OpenAI model name");
-        let mut headers = HeaderMap::new();
-        let auth = format!("Bearer {}", api_key.trim());
-        headers.insert(
-            AUTHORIZATION,
-            HeaderValue::from_str(&auth).context("invalid OpenAI API key")?,
-        );
-        headers.insert(CONTENT_TYPE, HeaderValue::from_static("application/json"));
-        let client = Client::builder()
-            .timeout(timeout)
-            .default_headers(headers)
-            .build()
-            .context("failed to build OpenAI HTTP client")?;
-        let endpoint = format!("{}/embeddings", base_url.trim_end_matches('/'));
-        Ok(Self {
-            client,
-            endpoint,
-            model,
-            dimensions,
-            max_retries,
-            batch_size,
-        })
-    }
-
-    fn embed_batch(&self, inputs: &[&str]) -> Result<Vec<Vec<f32>>> {
-        if inputs.is_empty() {
-            return Ok(Vec::new());
-        }
-        anyhow::ensure!(
-            inputs.len() <= self.batch_size,
-            "batch of {} exceeds configured max {}",
-            inputs.len(),
-            self.batch_size
-        );
-
-        let mut attempt = 0usize;
-        loop {
-            let request = EmbeddingRequest {
-                model: &self.model,
-                input: inputs,
-                dimensions: self.dimensions,
-            };
-            let response = self.client.post(&self.endpoint).json(&request).send();
-            match response {
-                Ok(resp) => {
-                    let status = resp.status();
-                    if status.is_success() {
-                        let mut parsed: EmbeddingResponse = resp
-                            .json()
-                            .context("failed to parse OpenAI embedding response")?;
-                        parsed.data.sort_by_key(|entry| entry.index);
-                        anyhow::ensure!(
-                            parsed.data.len() == inputs.len(),
-                            "OpenAI returned {} embeddings for {} inputs",
-                            parsed.data.len(),
-                            inputs.len()
-                        );
-                        return Ok(parsed
-                            .data
-                            .into_iter()
-                            .map(|entry| entry.embedding)
-                            .collect());
-                    }
-
-                    let body = resp
-                        .text()
-                        .unwrap_or_else(|_| "<body unavailable>".to_string());
-                    if self.should_retry(status) && attempt + 1 < self.max_retries {
-                        attempt += 1;
-                        thread::sleep(self.retry_backoff(attempt));
-                        continue;
-                    }
-                    anyhow::bail!("OpenAI embeddings request failed ({}): {}", status, body);
-                }
-                Err(err) => {
-                    if self.is_retryable_error(&err) && attempt + 1 < self.max_retries {
-                        attempt += 1;
-                        thread::sleep(self.retry_backoff(attempt));
-                        continue;
-                    }
-                    return Err(err.into());
-                }
-            }
-        }
-    }
-
-    fn should_retry(&self, status: StatusCode) -> bool {
-        status == StatusCode::TOO_MANY_REQUESTS || status.is_server_error()
-    }
-
-    fn is_retryable_error(&self, err: &reqwest::Error) -> bool {
-        err.is_timeout() || err.is_connect() || err.is_body() || err.is_request() || err.is_decode()
-    }
-
-    fn retry_backoff(&self, attempt: usize) -> Duration {
-        let capped = attempt.min(5) as u32;
-        Duration::from_millis(500 * (1 << capped))
-    }
-}
-
-#[derive(Serialize)]
-struct EmbeddingRequest<'a> {
-    model: &'a str,
-    #[serde(borrow)]
-    input: &'a [&'a str],
-    #[serde(skip_serializing_if = "Option::is_none")]
-    dimensions: Option<usize>,
-}
-
-#[derive(Debug, Deserialize)]
-struct EmbeddingResponse {
-    data: Vec<EmbeddingData>,
-}
-
-#[derive(Debug, Deserialize)]
-struct EmbeddingData {
-    embedding: Vec<f32>,
-    index: usize,
-}
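
The removed block presumably moved into `src/embedder/openai.rs` (that file is not shown in this excerpt; `src/embedder/mod.rs` below only declares the module). Assuming the relocated type keeps the constructor and `embed_batch` signatures deleted here and exposes them as `pub`, driving it from another pipeline would look roughly like the sketch below; the model name and tuning values are placeholders.

```rust
// Usage sketch (not part of this commit): drives the relocated embedder from
// another binary. Assumes `OpenAiEmbedder::new` and `embed_batch` keep the
// signatures removed from embedder.rs above and are now public.
use std::time::Duration;

use anyhow::Result;
use fastcrawl::embedder::openai::OpenAiEmbedder;

fn main() -> Result<()> {
    let embedder = OpenAiEmbedder::new(
        std::env::var("OPENAI_API_KEY")?,        // bearer token for the API
        "https://api.openai.com/v1".to_string(), // base URL; "/embeddings" is appended
        "text-embedding-3-small".to_string(),    // placeholder model name
        None,                                    // optional dimensions override
        Duration::from_secs(30),                 // per-request timeout
        3,                                       // retries on 429s and 5xx responses
        64,                                      // max inputs per request batch
    )?;

    // embed_batch takes string slices and returns one vector per input,
    // already sorted back into input order.
    let vectors = embedder.embed_batch(&["hello pgvector", "goodbye reqwest"])?;
    for (text, vector) in ["hello pgvector", "goodbye reqwest"].iter().zip(&vectors) {
        println!("{text}: {} dimensions", vector.len());
    }
    Ok(())
}
```

Because the original client was built on `reqwest::blocking`, the sketch stays synchronous; an async caller would need to wrap the calls in something like `spawn_blocking`.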

src/bin/pgvector_store.rs

Lines changed: 11 additions & 0 deletions

@@ -69,6 +69,7 @@ async fn main() -> Result<()> {
 
     let mut batch = Vec::with_capacity(batch_size);
     let Some(first_record) = next_record(&mut lines)? else {
+        println!("No embeddings to insert; nothing to do.");
         return Ok(());
     };
     anyhow::ensure!(
@@ -84,18 +85,28 @@ async fn main() -> Result<()> {
     }
 
     batch.push(first_record);
+    let mut total_inserted = 0usize;
     while let Some(record) = next_record(&mut lines)? {
         batch.push(record);
         if batch.len() >= batch_size {
             insert_batch(&mut client, &table, &batch, cli.upsert).await?;
+            total_inserted += batch.len();
             batch.clear();
         }
     }
 
     if !batch.is_empty() {
         insert_batch(&mut client, &table, &batch, cli.upsert).await?;
+        total_inserted += batch.len();
     }
 
+    println!(
+        "Successfully inserted {} record{} into {}.",
+        total_inserted,
+        if total_inserted == 1 { "" } else { "s" },
+        table.qualified()
+    );
+
     Ok(())
 }
 

src/embedder/mod.rs

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+//! Embedding client implementations shared by embedding pipelines.
+
+pub mod openai;
