Skip to content

Commit 5b8dda6

Browse files
committed
moved projects, added a project for zeroentropy on GPT-2
1 parent b9da519 commit 5b8dda6

File tree

10 files changed

+58
-133
lines changed

10 files changed

+58
-133
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,6 @@ docs/
3333

3434
# Future integration notes (internal planning)
3535
future-integrations/
36+
.claude
37+
.github
38+
.env

README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,10 +306,19 @@ match client.collections().add("my_collection").await {
306306

307307
## Examples
308308

309-
Check out the [examples](examples/) directory for more complete examples:
309+
Check out the [examples](examples/) directory for complete examples:
310310

311+
### Basic Examples
311312
- [basic.rs](examples/basic.rs) - Complete workflow from collection creation to search
312313
- [arxiv_search.rs](examples/arxiv_search.rs) - Download and search arXiv papers with PDF support
314+
- [ehr_search.rs](examples/ehr_search.rs) - Electronic health record search example
315+
316+
### Advanced Examples
317+
- [search_gpt2_dataset.rs](examples/search_gpt2_dataset.rs) - Search through the GPT-2 dataset
318+
- [emergence_tester.rs](examples/emergence_tester.rs) - Test emergence patterns in semantic search
319+
- [phoneme_to_word_bci.rs](examples/phoneme_to_word_bci.rs) - Brain-computer interface phoneme matching
320+
- [phoneme_to_word_advanced.rs](examples/phoneme_to_word_advanced.rs) - Advanced phoneme matching
321+
- [phoneme_to_word_full_dataset.rs](examples/phoneme_to_word_full_dataset.rs) - Full dataset phoneme analysis
313322

314323
Run an example:
315324

SESSION_SUMMARY.md

Lines changed: 0 additions & 101 deletions
This file was deleted.
File renamed without changes.
File renamed without changes.

examples/search_gpt2_dataset.rs

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ struct Cli {
2222
enum Commands {
2323
/// Index dataset into ZeroEntropy collections
2424
Index {
25-
/// Number of documents to index per collection
26-
#[arg(short, long, default_value = "100")]
25+
/// Number of documents to index per collection (0 = all documents)
26+
#[arg(short, long, default_value = "0")]
2727
limit: usize,
2828

2929
/// Collections to index (comma-separated: webtext,gpt2_small,gpt2_medium,gpt2_large,gpt2_xl)
@@ -53,6 +53,17 @@ enum Commands {
5353
Interactive,
5454
}
5555

56+
/// Expands a leading `~` component in `path` to the current user's home directory.
///
/// Only a path that is exactly `~`, or whose first component is `~` (for
/// example `~/data`), is expanded. Paths such as `~other/data` or a file
/// named `~backup` are returned unchanged, because they do not refer to the
/// current user's home directory.
///
/// The home directory is taken from the `USERPROFILE` environment variable,
/// falling back to `HOME`. Unset or empty variables are ignored; if neither
/// yields a value, `path` is returned unchanged.
fn expand_tilde(path: &Path) -> PathBuf {
    // `Path::strip_prefix` compares whole components, so it succeeds for `~`
    // and `~/...` but fails for `~other/...` and `~backup`.
    let rest = match path.strip_prefix("~") {
        Ok(rest) => rest,
        Err(_) => return path.to_path_buf(),
    };

    // `var_os` keeps home directories that are not valid UTF-8. An empty value
    // is skipped so that `~/data` is never silently rewritten to `/data`.
    let home = ["USERPROFILE", "HOME"]
        .iter()
        .filter_map(|key| std::env::var_os(key))
        .find(|value| !value.is_empty());

    match home {
        // A bare `~` maps to the home directory itself (no trailing separator).
        Some(home) if rest.as_os_str().is_empty() => PathBuf::from(home),
        Some(home) => PathBuf::from(home).join(rest),
        None => path.to_path_buf(),
    }
}
66+
5667
#[tokio::main]
5768
async fn main() -> Result<(), Box<dyn std::error::Error>> {
5869
// Load .env file if it exists
@@ -63,18 +74,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
6374
// Create client from ZEROENTROPY_API_KEY environment variable
6475
let client = Client::from_env()?;
6576

66-
// Dataset path
67-
let dataset_path = &cli.dataset;
77+
// Dataset path - expand tilde if present
78+
let dataset_path = expand_tilde(&cli.dataset);
6879

69-
// Collections to search
70-
let collections = vec![
71-
("webtext", "webtext.valid.jsonl"),
72-
("gpt2_small", "small-117M.valid.jsonl"),
73-
("gpt2_medium", "medium-345M.valid.jsonl"),
74-
("gpt2_large", "large-762M.valid.jsonl"),
75-
("gpt2_xl", "xl-1542M.valid.jsonl"),
76-
];
77-
7880
// Available collections
7981
let all_collections = vec![
8082
("webtext", "webtext.valid.jsonl"),
@@ -87,7 +89,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
8789
match cli.command {
8890
Commands::Index { limit, collections: selected } => {
8991
let collections_to_index = filter_collections(&all_collections, selected);
90-
index_collections(&client, dataset_path, &collections_to_index, limit).await?;
92+
index_collections(&client, &dataset_path, &collections_to_index, limit).await?;
9193
}
9294
Commands::Search { query, limit, collections: selected } => {
9395
let collections_to_search = filter_collections(&all_collections, selected);
@@ -128,35 +130,43 @@ async fn index_collections(
128130
println!("{}", "=".repeat(60));
129131
println!("Indexing GPT-2 Dataset");
130132
println!("{}", "=".repeat(60));
131-
println!("Limit: {} documents per collection", limit);
133+
if limit == 0 {
134+
println!("Indexing all documents");
135+
} else {
136+
println!("Limit: {} documents per collection", limit);
137+
}
132138
println!();
133139
for (collection_name, filename) in collections {
134-
println!("\n📂 Processing {}...", collection_name);
140+
println!("\nProcessing {}...", collection_name);
135141

136142
// Create collection
137143
match client.collections().add(*collection_name).await {
138-
Ok(response) => println!(" {}", response.message),
144+
Ok(response) => println!(" {}", response.message),
139145
Err(zeroentropy_community::Error::Conflict(_)) => {
140-
println!(" Collection already exists");
146+
println!(" Collection already exists");
141147
}
142148
Err(e) => return Err(e.into()),
143149
}
144150

145151
// Load and index samples
146152
let file_path = dataset_path.join(filename);
147153
if !file_path.exists() {
148-
println!(" ⚠️ File not found: {}", file_path.display());
154+
println!(" File not found: {}", file_path.display());
149155
continue;
150156
}
151157

152158
let file = File::open(&file_path)?;
153159
let reader = BufReader::new(file);
154160

155-
println!(" 📊 Indexing up to {} samples...", limit);
161+
if limit == 0 {
162+
println!(" Indexing all samples...");
163+
} else {
164+
println!(" Indexing up to {} samples...", limit);
165+
}
156166
let mut count = 0;
157167

158168
for (idx, line) in reader.lines().enumerate() {
159-
if idx >= limit {
169+
if limit > 0 && idx >= limit {
160170
break;
161171
}
162172

@@ -183,7 +193,11 @@ async fn index_collections(
183193
Some(metadata),
184194
).await {
185195
Ok(_) => count += 1,
186-
Err(e) => eprintln!(" ⚠️ Error adding document {}: {}", idx, e),
196+
Err(zeroentropy_community::Error::Conflict(_)) => {
197+
// Document already exists, skip silently
198+
continue;
199+
}
200+
Err(e) => eprintln!(" Error adding document {}: {}", idx, e),
187201
}
188202

189203
if count % 10 == 0 {
@@ -194,7 +208,7 @@ async fn index_collections(
194208
}
195209
}
196210

197-
println!("\n Indexed {} documents from {}", count, collection_name);
211+
println!("\n Indexed {} documents from {}", count, collection_name);
198212
}
199213

200214
Ok(())
@@ -222,7 +236,7 @@ async fn code_search(
222236
];
223237

224238
for query in code_queries {
225-
println!("\n🔍 Searching for: \"{}\"", query);
239+
println!("\nSearching for: \"{}\"", query);
226240
println!("{}", "-".repeat(60));
227241

228242
// Search each collection
@@ -238,13 +252,13 @@ async fn code_search(
238252
).await {
239253
Ok(r) => r,
240254
Err(e) => {
241-
println!(" ⚠️ Error searching {}: {}", collection_name, e);
255+
println!(" Error searching {}: {}", collection_name, e);
242256
continue;
243257
}
244258
};
245259

246260
if !results.results.is_empty() {
247-
println!("\n 📊 {} ({} results):", collection_name, results.results.len());
261+
println!("\n {} ({} results):", collection_name, results.results.len());
248262

249263
for (i, result) in results.results.iter().take(2).enumerate() {
250264
println!("\n {}. {} (score: {:.4})", i + 1, result.path, result.score);
@@ -289,13 +303,13 @@ async fn search_collections(
289303
).await {
290304
Ok(r) => r,
291305
Err(e) => {
292-
println!("⚠️ Error searching {}: {}", collection_name, e);
306+
println!("Error searching {}: {}", collection_name, e);
293307
continue;
294308
}
295309
};
296310

297311
if !results.results.is_empty() {
298-
println!("📊 {} - Found {} matches:", collection_name, results.results.len());
312+
println!("{} - Found {} matches:", collection_name, results.results.len());
299313

300314
for (i, result) in results.results.iter().enumerate() {
301315
println!("\n {}. {} (score: {:.4})", i + 1, result.path, result.score);
@@ -336,7 +350,7 @@ async fn interactive_search(
336350
break;
337351
}
338352

339-
println!("\n🔍 Searching all collections for: \"{}\"", query);
353+
println!("\nSearching all collections for: \"{}\"", query);
340354
println!("{}", "-".repeat(60));
341355

342356
// Search all collections
@@ -352,13 +366,13 @@ async fn interactive_search(
352366
).await {
353367
Ok(r) => r,
354368
Err(e) => {
355-
println!(" ⚠️ Error: {}", e);
369+
println!(" Error: {}", e);
356370
continue;
357371
}
358372
};
359373

360374
if !results.results.is_empty() {
361-
println!("\n 📊 {} - Found {} matches:", collection_name, results.results.len());
375+
println!("\n {} - Found {} matches:", collection_name, results.results.len());
362376

363377
for (i, result) in results.results.iter().take(3).enumerate() {
364378
println!("\n {}. {} (score: {:.4})", i + 1, result.path, result.score);

0 commit comments

Comments
 (0)