@@ -22,8 +22,8 @@ struct Cli {
2222enum Commands {
2323 /// Index dataset into ZeroEntropy collections
2424 Index {
25- /// Number of documents to index per collection
26- #[ arg( short, long, default_value = "100 " ) ]
25+ /// Number of documents to index per collection (0 = all documents)
26+ #[ arg( short, long, default_value = "0 " ) ]
2727 limit : usize ,
2828
2929 /// Collections to index (comma-separated: webtext,gpt2_small,gpt2_medium,gpt2_large,gpt2_xl)
@@ -53,6 +53,17 @@ enum Commands {
5353 Interactive ,
5454}
5555
/// Expands a leading `~` path component to the current user's home directory.
///
/// The home directory is read from `USERPROFILE` and, if that is unset or
/// empty, from `HOME`. Only a path whose *first component* is exactly `~`
/// is expanded (`~` or `~/data`). Names that merely begin with a tilde, such
/// as `~backup` or `~other/data`, are returned unchanged, as is every path
/// when no home directory can be determined.
fn expand_tilde(path: &Path) -> PathBuf {
    // `strip_prefix` compares whole components, so `~backup` does not match
    // `~`, and staying on `Path` keeps non-UTF-8 paths intact.
    let rest = match path.strip_prefix("~") {
        Ok(rest) => rest,
        Err(_) => return path.to_path_buf(),
    };

    // An empty variable is treated as unset; otherwise `~/x` would silently
    // become the unrelated path `/x`.
    let home = ["USERPROFILE", "HOME"]
        .iter()
        .filter_map(|name| std::env::var_os(name))
        .find(|value| !value.is_empty());

    match home {
        // A bare `~` maps to the home directory itself, without the trailing
        // separator that joining an empty path would add.
        Some(home) if rest.as_os_str().is_empty() => PathBuf::from(home),
        Some(home) => PathBuf::from(home).join(rest),
        None => path.to_path_buf(),
    }
}
66+
5667#[ tokio:: main]
5768async fn main ( ) -> Result < ( ) , Box < dyn std:: error:: Error > > {
5869 // Load .env file if it exists
@@ -63,18 +74,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
6374 // Create client from ZEROENTROPY_API_KEY environment variable
6475 let client = Client :: from_env ( ) ?;
6576
66- // Dataset path
67- let dataset_path = & cli. dataset ;
77+ // Dataset path - expand tilde if present
78+ let dataset_path = expand_tilde ( & cli. dataset ) ;
6879
69- // Collections to search
70- let collections = vec ! [
71- ( "webtext" , "webtext.valid.jsonl" ) ,
72- ( "gpt2_small" , "small-117M.valid.jsonl" ) ,
73- ( "gpt2_medium" , "medium-345M.valid.jsonl" ) ,
74- ( "gpt2_large" , "large-762M.valid.jsonl" ) ,
75- ( "gpt2_xl" , "xl-1542M.valid.jsonl" ) ,
76- ] ;
77-
7880 // Available collections
7981 let all_collections = vec ! [
8082 ( "webtext" , "webtext.valid.jsonl" ) ,
@@ -87,7 +89,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
8789 match cli. command {
8890 Commands :: Index { limit, collections : selected } => {
8991 let collections_to_index = filter_collections ( & all_collections, selected) ;
90- index_collections ( & client, dataset_path, & collections_to_index, limit) . await ?;
92+ index_collections ( & client, & dataset_path, & collections_to_index, limit) . await ?;
9193 }
9294 Commands :: Search { query, limit, collections : selected } => {
9395 let collections_to_search = filter_collections ( & all_collections, selected) ;
@@ -128,35 +130,43 @@ async fn index_collections(
128130 println ! ( "{}" , "=" . repeat( 60 ) ) ;
129131 println ! ( "Indexing GPT-2 Dataset" ) ;
130132 println ! ( "{}" , "=" . repeat( 60 ) ) ;
131- println ! ( "Limit: {} documents per collection" , limit) ;
133+ if limit == 0 {
134+ println ! ( "Indexing all documents" ) ;
135+ } else {
136+ println ! ( "Limit: {} documents per collection" , limit) ;
137+ }
132138 println ! ( ) ;
133139 for ( collection_name, filename) in collections {
134- println ! ( "\n 📂 Processing {}..." , collection_name) ;
140+ println ! ( "\n Processing {}..." , collection_name) ;
135141
136142 // Create collection
137143 match client. collections ( ) . add ( * collection_name) . await {
138- Ok ( response) => println ! ( " ✓ {}" , response. message) ,
144+ Ok ( response) => println ! ( " {}" , response. message) ,
139145 Err ( zeroentropy_community:: Error :: Conflict ( _) ) => {
140- println ! ( " ℹ Collection already exists" ) ;
146+ println ! ( " Collection already exists" ) ;
141147 }
142148 Err ( e) => return Err ( e. into ( ) ) ,
143149 }
144150
145151 // Load and index samples
146152 let file_path = dataset_path. join ( filename) ;
147153 if !file_path. exists ( ) {
148- println ! ( " ⚠️ File not found: {}" , file_path. display( ) ) ;
154+ println ! ( " File not found: {}" , file_path. display( ) ) ;
149155 continue ;
150156 }
151157
152158 let file = File :: open ( & file_path) ?;
153159 let reader = BufReader :: new ( file) ;
154160
155- println ! ( " 📊 Indexing up to {} samples..." , limit) ;
161+ if limit == 0 {
162+ println ! ( " Indexing all samples..." ) ;
163+ } else {
164+ println ! ( " Indexing up to {} samples..." , limit) ;
165+ }
156166 let mut count = 0 ;
157167
158168 for ( idx, line) in reader. lines ( ) . enumerate ( ) {
159- if idx >= limit {
169+ if limit > 0 && idx >= limit {
160170 break ;
161171 }
162172
@@ -183,7 +193,11 @@ async fn index_collections(
183193 Some ( metadata) ,
184194 ) . await {
185195 Ok ( _) => count += 1 ,
186- Err ( e) => eprintln ! ( " ⚠️ Error adding document {}: {}" , idx, e) ,
196+ Err ( zeroentropy_community:: Error :: Conflict ( _) ) => {
197+ // Document already exists, skip silently
198+ continue ;
199+ }
200+ Err ( e) => eprintln ! ( " Error adding document {}: {}" , idx, e) ,
187201 }
188202
189203 if count % 10 == 0 {
@@ -194,7 +208,7 @@ async fn index_collections(
194208 }
195209 }
196210
197- println ! ( "\n ✓ Indexed {} documents from {}" , count, collection_name) ;
211+ println ! ( "\n Indexed {} documents from {}" , count, collection_name) ;
198212 }
199213
200214 Ok ( ( ) )
@@ -222,7 +236,7 @@ async fn code_search(
222236 ] ;
223237
224238 for query in code_queries {
225- println ! ( "\n 🔍 Searching for: \" {}\" " , query) ;
239+ println ! ( "\n Searching for: \" {}\" " , query) ;
226240 println ! ( "{}" , "-" . repeat( 60 ) ) ;
227241
228242 // Search each collection
@@ -238,13 +252,13 @@ async fn code_search(
238252 ) . await {
239253 Ok ( r) => r,
240254 Err ( e) => {
241- println ! ( " ⚠️ Error searching {}: {}" , collection_name, e) ;
255+ println ! ( " Error searching {}: {}" , collection_name, e) ;
242256 continue ;
243257 }
244258 } ;
245259
246260 if !results. results . is_empty ( ) {
247- println ! ( "\n 📊 {} ({} results):" , collection_name, results. results. len( ) ) ;
261+ println ! ( "\n {} ({} results):" , collection_name, results. results. len( ) ) ;
248262
249263 for ( i, result) in results. results . iter ( ) . take ( 2 ) . enumerate ( ) {
250264 println ! ( "\n {}. {} (score: {:.4})" , i + 1 , result. path, result. score) ;
@@ -289,13 +303,13 @@ async fn search_collections(
289303 ) . await {
290304 Ok ( r) => r,
291305 Err ( e) => {
292- println ! ( "⚠️ Error searching {}: {}" , collection_name, e) ;
306+ println ! ( "Error searching {}: {}" , collection_name, e) ;
293307 continue ;
294308 }
295309 } ;
296310
297311 if !results. results . is_empty ( ) {
298- println ! ( "📊 {} - Found {} matches:" , collection_name, results. results. len( ) ) ;
312+ println ! ( "{} - Found {} matches:" , collection_name, results. results. len( ) ) ;
299313
300314 for ( i, result) in results. results . iter ( ) . enumerate ( ) {
301315 println ! ( "\n {}. {} (score: {:.4})" , i + 1 , result. path, result. score) ;
@@ -336,7 +350,7 @@ async fn interactive_search(
336350 break ;
337351 }
338352
339- println ! ( "\n 🔍 Searching all collections for: \" {}\" " , query) ;
353+ println ! ( "\n Searching all collections for: \" {}\" " , query) ;
340354 println ! ( "{}" , "-" . repeat( 60 ) ) ;
341355
342356 // Search all collections
@@ -352,13 +366,13 @@ async fn interactive_search(
352366 ) . await {
353367 Ok ( r) => r,
354368 Err ( e) => {
355- println ! ( " ⚠️ Error: {}" , e) ;
369+ println ! ( " Error: {}" , e) ;
356370 continue ;
357371 }
358372 } ;
359373
360374 if !results. results . is_empty ( ) {
361- println ! ( "\n 📊 {} - Found {} matches:" , collection_name, results. results. len( ) ) ;
375+ println ! ( "\n {} - Found {} matches:" , collection_name, results. results. len( ) ) ;
362376
363377 for ( i, result) in results. results . iter ( ) . take ( 3 ) . enumerate ( ) {
364378 println ! ( "\n {}. {} (score: {:.4})" , i + 1 , result. path, result. score) ;
0 commit comments