forked from 0xPlaygrounds/rig
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_agent.rs
More file actions
108 lines (85 loc) · 3.23 KB
/
pdf_agent.rs
File metadata and controls
108 lines (85 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
use anyhow::{Context, Result};
use rig::{
embeddings::EmbeddingsBuilder, loaders::PdfFileLoader, providers::openai,
vector_store::in_memory_store::InMemoryVectorStore, Embed,
};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
/// A single chunk of PDF text prepared for embedding.
///
/// `#[derive(Embed)]` (from rig) generates the embedding plumbing; the
/// `#[embed]` attribute below selects which field's text is embedded.
#[derive(Embed, Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
struct Document {
    // Unique identifier for the chunk (e.g. "pdf_document_0").
    id: String,
    // The chunk's text content; this is the field that gets embedded.
    #[embed]
    content: String,
}
fn load_pdf(path: PathBuf) -> Result<Vec<String>> {
const CHUNK_SIZE: usize = 2000;
let content_chunks = PdfFileLoader::with_glob(path.to_str().context("Invalid path")?)?
.read()
.into_iter()
.filter_map(|result| {
result
.map_err(|e| {
eprintln!("Error reading PDF content: {}", e);
e
})
.ok()
})
.flat_map(|content| {
let mut chunks = Vec::new();
let mut current = String::new();
for word in content.split_whitespace() {
if current.len() + word.len() + 1 > CHUNK_SIZE && !current.is_empty() {
chunks.push(std::mem::take(&mut current).trim().to_string());
}
current.push_str(word);
current.push(' ');
}
if !current.is_empty() {
chunks.push(current.trim().to_string());
}
chunks
})
.collect::<Vec<_>>();
if content_chunks.is_empty() {
anyhow::bail!("No content found in PDF file: {}", path.display());
}
Ok(content_chunks)
}
#[tokio::main]
async fn main() -> Result<()> {
    // Talk to a local Ollama server through its OpenAI-compatible endpoint.
    let client = openai::Client::from_url("ollama", "http://localhost:11434/v1");

    // Load and chunk the example PDF with Rig's built-in PDF loader.
    let documents_dir = std::env::current_dir()?.join("rig-core/examples/documents");
    let pdf_chunks =
        load_pdf(documents_dir.join("deepseek_r1.pdf")).context("Failed to load pdf documents")?;
    println!("Successfully loaded and chunked PDF documents");

    // Embed every chunk; try_fold threads the fallible builder through
    // each document, short-circuiting on the first error.
    let model = client.embedding_model("bge-m3");
    let builder = pdf_chunks.into_iter().enumerate().try_fold(
        EmbeddingsBuilder::new(model.clone()),
        |builder, (i, chunk)| {
            builder.document(Document {
                id: format!("pdf_document_{}", i),
                content: chunk,
            })
        },
    )?;
    let embeddings = builder.build().await?;
    println!("Successfully generated embeddings");

    // Index the embeddings in an in-memory vector store.
    let vector_store = InMemoryVectorStore::from_documents(embeddings);
    let index = vector_store.index(model);
    println!("Successfully created vector store and index");

    // Build the RAG agent with dynamic context drawn from the index.
    let rag_agent = client
        .agent("deepseek-r1")
        .preamble("You are a helpful assistant that answers questions based on the provided document context. When answering questions, try to synthesize information from multiple chunks if they're related.")
        .dynamic_context(1, index)
        .build();
    println!("Starting CLI chatbot...");

    // Hand control to the interactive CLI loop.
    rig::cli_chatbot::cli_chatbot(rag_agent).await?;
    Ok(())
}