@@ -2,7 +2,6 @@ package main
22
33import (
44 "context"
5- "errors"
65 "fmt"
76 "iter"
87 "log/slog"
@@ -19,7 +18,7 @@ import (
1918)
2019
2120// CohereWikipediaEmbeddings provides template data from Cohere's Wikipedia embeddings.
22- // See: https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings
21+ // See: https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3
2322type CohereWikipediaEmbeddings struct {
2423 logger * slog.Logger
2524 lock sync.Mutex // protects downloads
@@ -81,7 +80,7 @@ func (c *CohereWikipediaEmbeddings) downloadFile(ctx context.Context, fileName s
8180 c .logger .Info ("downloading file" , slog .String ("file" , fileName ))
8281
8382 url := fmt .Sprintf (
84- "https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings /resolve/main/data /%s?download=true" ,
83+ "https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3 /resolve/main/en /%s?download=true" ,
8584 fileName ,
8685 )
8786 req , err := http .NewRequestWithContext (ctx , http .MethodGet , url , nil )
@@ -161,15 +160,17 @@ func (cvs *cohereVectorSource) Vector(dims int) ([]float32, error) {
161160 }
162161}
163162
163+ const cohereWikipediaEmbeddingFileCount = 415 // exclusive upper bound on file indexes: loadNextFile returns "no more files" once nextFile reaches it
164+
164165func (cvs * cohereVectorSource ) loadNextFile (ctx context.Context ) error {
165166 fileIdx := cvs .nextFile
166- if fileIdx >= len ( cohereWikipediaEmbeddingFiles ) {
167- return errors . New ("no more files" )
167+ if fileIdx >= cohereWikipediaEmbeddingFileCount {
168+ return fmt . Errorf ("no more files" )
168169 }
169170 cvs .nextFile ++
170171
171172 var (
172- fname = cohereWikipediaEmbeddingFiles [ fileIdx ]
173+ fname = cohereEmbeddingFileName ( fileIdx )
173174 fp = cvs .orig .filePath (fname )
174175 )
175176 if _ , err := os .Stat (fp ); os .IsNotExist (err ) {
@@ -183,7 +184,7 @@ func (cvs *cohereVectorSource) loadNextFile(ctx context.Context) error {
183184 return fmt .Errorf ("failed to memory map file %s: %w" , fp , err )
184185 }
185186
186- vectorSeq , err := readVectorColumn (mmapped , 8 , 768 )
187+ vectorSeq , err := readVectorColumn (mmapped , 4 , 1024 )
187188 if err != nil {
188189 return fmt .Errorf ("failed to read vectors from file %s: %w" , fp , err )
189190 }
@@ -251,13 +252,13 @@ func (cts *cohereTextSource) getText() (string, error) {
251252
252253func (cts * cohereTextSource ) loadNextFile (ctx context.Context ) error {
253254 fileIdx := cts .nextFile
254- if fileIdx >= len ( cohereWikipediaEmbeddingFiles ) {
255- return errors . New ("no more files" )
255+ if fileIdx >= cohereWikipediaEmbeddingFileCount {
256+ return fmt . Errorf ("no more files" )
256257 }
257258 cts .nextFile ++
258259
259260 var (
260- fname = cohereWikipediaEmbeddingFiles [ fileIdx ]
261+ fname = cohereEmbeddingFileName ( fileIdx )
261262 fp = cts .orig .filePath (fname )
262263 )
263264 if _ , err := os .Stat (fp ); os .IsNotExist (err ) {
@@ -270,7 +271,7 @@ func (cts *cohereTextSource) loadNextFile(ctx context.Context) error {
270271 if err != nil {
271272 return fmt .Errorf ("failed to memory map file %s: %w" , fp , err )
272273 }
273- textSeq , err := readTextColumn (mmapped .Data , 2 )
274+ textSeq , err := readTextColumn (mmapped .Data , 3 )
274275 if err != nil {
275276 return fmt .Errorf ("failed to read texts from file %s: %w" , fp , err )
276277 }
@@ -280,6 +281,13 @@ func (cts *cohereTextSource) loadNextFile(ctx context.Context) error {
280281 return nil
281282}
282283
284+ // cohereEmbeddingFileName returns the parquet file name for the embedding file
285+ // at index idx: the index zero-padded to at least four digits plus ".parquet",
286+ // e.g. 0 -> "0000.parquet", 1 -> "0001.parquet". Wider indexes are not truncated.
287+ func cohereEmbeddingFileName (idx int ) string {
288+ return fmt .Sprintf ("%04d.parquet" , idx )
289+ }
290+
283291func readTextColumn (fileContent []byte , column int64 ) (iter.Seq [string ], error ) {
284292 bf := buffer .NewBufferFileFromBytesNoAlloc (fileContent )
285293 pr , err := reader .NewParquetColumnReader (bf , 1 )
@@ -299,259 +307,3 @@ func readTextColumn(fileContent []byte, column int64) (iter.Seq[string], error)
299307 }
300308 }, nil
301309}
302-
303- var cohereWikipediaEmbeddingFiles = []string {
304- "train-00000-of-00253-8d3dffb4e6ef0304.parquet" ,
305- "train-00001-of-00253-2840fd802467fbe7.parquet" ,
306- "train-00002-of-00253-0ecc6c7ff8c4fa3c.parquet" ,
307- "train-00003-of-00253-32f0ed655d4213a4.parquet" ,
308- "train-00004-of-00253-8c33c9247a95d7f8.parquet" ,
309- "train-00005-of-00253-3bddd83c63a94665.parquet" ,
310- "train-00006-of-00253-85d4ae179ff9070e.parquet" ,
311- "train-00007-of-00253-8b41b4aec099f4b5.parquet" ,
312- "train-00008-of-00253-a19dcf6c4aa42b30.parquet" ,
313- "train-00009-of-00253-066c6e7f9bde2198.parquet" ,
314- "train-00010-of-00253-b374cd991cbb5156.parquet" ,
315- "train-00011-of-00253-795bbf4873fcc654.parquet" ,
316- "train-00012-of-00253-4917d4a02ad59415.parquet" ,
317- "train-00013-of-00253-acdc606afcb212c0.parquet" ,
318- "train-00014-of-00253-2aee8194c5707647.parquet" ,
319- "train-00015-of-00253-22d46ba400546eee.parquet" ,
320- "train-00016-of-00253-801956227c7f4aa3.parquet" ,
321- "train-00017-of-00253-6d74f63e2ad1b730.parquet" ,
322- "train-00018-of-00253-255f79226706d5c1.parquet" ,
323- "train-00019-of-00253-f4b2d38fe84c0ead.parquet" ,
324- "train-00020-of-00253-1de85ebaad99658e.parquet" ,
325- "train-00021-of-00253-67ad23e305e3295c.parquet" ,
326- "train-00022-of-00253-f9c5d70b7960cda7.parquet" ,
327- "train-00023-of-00253-b244988e872e8a1d.parquet" ,
328- "train-00024-of-00253-f423c747987257c1.parquet" ,
329- "train-00025-of-00253-29396090fdc2a78a.parquet" ,
330- "train-00026-of-00253-5ddb43fb1770e7f8.parquet" ,
331- "train-00027-of-00253-752a8bc5622d323e.parquet" ,
332- "train-00028-of-00253-da03da3e8033c428.parquet" ,
333- "train-00029-of-00253-5e2ba1902a8f5656.parquet" ,
334- "train-00030-of-00253-c4e515713a5cc8e0.parquet" ,
335- "train-00031-of-00253-5c2c714efc08b13c.parquet" ,
336- "train-00032-of-00253-c9efe2798824ac51.parquet" ,
337- "train-00033-of-00253-be9e14cac59e122e.parquet" ,
338- "train-00034-of-00253-ccc6b075279d9712.parquet" ,
339- "train-00035-of-00253-834ae2f2c5285a99.parquet" ,
340- "train-00036-of-00253-74856ee603b672f0.parquet" ,
341- "train-00037-of-00253-116477598973b86c.parquet" ,
342- "train-00038-of-00253-17b2331379fa72f6.parquet" ,
343- "train-00039-of-00253-7be12594af855ee2.parquet" ,
344- "train-00040-of-00253-84f19228bd41f6da.parquet" ,
345- "train-00041-of-00253-853105c503a5710a.parquet" ,
346- "train-00042-of-00253-f9187b7900752a8a.parquet" ,
347- "train-00043-of-00253-6e57290c3c8b5d5f.parquet" ,
348- "train-00044-of-00253-931ce22b00a4c15b.parquet" ,
349- "train-00045-of-00253-390980d0f55a07cb.parquet" ,
350- "train-00046-of-00253-89ed19ffa8016398.parquet" ,
351- "train-00047-of-00253-3c12f3ead94bfbfb.parquet" ,
352- "train-00048-of-00253-eaa84bcfb85c6f72.parquet" ,
353- "train-00049-of-00253-f60d394df69f0cfd.parquet" ,
354- "train-00050-of-00253-2e9e406b1fe1637d.parquet" ,
355- "train-00051-of-00253-a59dcef64c057c2f.parquet" ,
356- "train-00052-of-00253-c890ab67a7833e35.parquet" ,
357- "train-00053-of-00253-d2066657b608a39c.parquet" ,
358- "train-00054-of-00253-078aebe8abfb2ce8.parquet" ,
359- "train-00055-of-00253-005c356342d5bd48.parquet" ,
360- "train-00056-of-00253-c7aa34ae740fe73c.parquet" ,
361- "train-00057-of-00253-0ac063e31c203212.parquet" ,
362- "train-00058-of-00253-bd2bc91dc377d4a3.parquet" ,
363- "train-00059-of-00253-a49d9bfcc1c73245.parquet" ,
364- "train-00060-of-00253-66bcd667a0f51ca1.parquet" ,
365- "train-00061-of-00253-e03a6c0d915a0d72.parquet" ,
366- "train-00062-of-00253-164bc5605313cf93.parquet" ,
367- "train-00063-of-00253-eb485500a368fb6e.parquet" ,
368- "train-00064-of-00253-0017fc575755acc7.parquet" ,
369- "train-00065-of-00253-8c43415a5f2be2ce.parquet" ,
370- "train-00066-of-00253-ec9c5821e40f26f4.parquet" ,
371- "train-00067-of-00253-dc20a358a4dec4ef.parquet" ,
372- "train-00068-of-00253-a668d48636bd4ad6.parquet" ,
373- "train-00069-of-00253-e5adf7e0505b0ed9.parquet" ,
374- "train-00070-of-00253-a37d9c23f701c52c.parquet" ,
375- "train-00071-of-00253-2ad7eba51e43c84a.parquet" ,
376- "train-00072-of-00253-c750269b7e722e9c.parquet" ,
377- "train-00073-of-00253-306cdafd84214680.parquet" ,
378- "train-00074-of-00253-2d90645be188e613.parquet" ,
379- "train-00075-of-00253-0f2ea04b7339877e.parquet" ,
380- "train-00076-of-00253-93b8a7854df926bd.parquet" ,
381- "train-00077-of-00253-c721bb168a7ab59a.parquet" ,
382- "train-00078-of-00253-ae44665c35f92328.parquet" ,
383- "train-00079-of-00253-c7436cb8e9728f6e.parquet" ,
384- "train-00080-of-00253-49d0d951966b3c22.parquet" ,
385- "train-00081-of-00253-227c70e7b165e2b4.parquet" ,
386- "train-00082-of-00253-1269befa065af101.parquet" ,
387- "train-00083-of-00253-fc5e8a5fa73be0e7.parquet" ,
388- "train-00084-of-00253-f13a198f26475f4b.parquet" ,
389- "train-00085-of-00253-c1fa9e92d40e7c52.parquet" ,
390- "train-00086-of-00253-117382acbaf2d268.parquet" ,
391- "train-00087-of-00253-650b081492b280e8.parquet" ,
392- "train-00088-of-00253-8e74cc842f11c0ca.parquet" ,
393- "train-00089-of-00253-cf36d64831d2fc3a.parquet" ,
394- "train-00090-of-00253-71d852dfdb9d6cfc.parquet" ,
395- "train-00091-of-00253-0912a1fd533b07f1.parquet" ,
396- "train-00092-of-00253-a5c6e71c0c70fec6.parquet" ,
397- "train-00093-of-00253-d3d19c66e736f451.parquet" ,
398- "train-00094-of-00253-890a031231a7fa6b.parquet" ,
399- "train-00095-of-00253-ba267d0930a2f943.parquet" ,
400- "train-00096-of-00253-fd118f187a0a5a70.parquet" ,
401- "train-00097-of-00253-76271b0701e12b92.parquet" ,
402- "train-00098-of-00253-9be1c850b35663be.parquet" ,
403- "train-00099-of-00253-7b61e259ab69e144.parquet" ,
404- "train-00100-of-00253-1a7a4c5d83f9b58d.parquet" ,
405- "train-00101-of-00253-b18b780bfb3cb994.parquet" ,
406- "train-00102-of-00253-8adc6f0687e89f39.parquet" ,
407- "train-00103-of-00253-3f98bb88e6710c42.parquet" ,
408- "train-00104-of-00253-190d8475a05317d6.parquet" ,
409- "train-00105-of-00253-c3783ca560352491.parquet" ,
410- "train-00106-of-00253-805da5014fb3169f.parquet" ,
411- "train-00107-of-00253-f501e794311cc86c.parquet" ,
412- "train-00108-of-00253-7a6399540e7664be.parquet" ,
413- "train-00109-of-00253-6b04d06ed2afe35f.parquet" ,
414- "train-00110-of-00253-fc14df8eb2dba67d.parquet" ,
415- "train-00111-of-00253-35420c7229adc959.parquet" ,
416- "train-00112-of-00253-3ad9687af1fb6db1.parquet" ,
417- "train-00113-of-00253-1b778b3bc5ed1a5a.parquet" ,
418- "train-00114-of-00253-a5caaebba1f2381b.parquet" ,
419- "train-00115-of-00253-c66611cd4369dfea.parquet" ,
420- "train-00116-of-00253-ce5b4f38ffcefe3e.parquet" ,
421- "train-00117-of-00253-6937c17f9c6ee8b0.parquet" ,
422- "train-00118-of-00253-ffd636470e41df94.parquet" ,
423- "train-00119-of-00253-c716b06fe5c720ac.parquet" ,
424- "train-00120-of-00253-950fdfc157360aa5.parquet" ,
425- "train-00121-of-00253-4a433b375723ae25.parquet" ,
426- "train-00122-of-00253-4a048360997b48dc.parquet" ,
427- "train-00123-of-00253-f44a87ba12d3f01f.parquet" ,
428- "train-00124-of-00253-33590ef565c33d3a.parquet" ,
429- "train-00125-of-00253-5d535fbc76c00aff.parquet" ,
430- "train-00126-of-00253-542de0e05c14e36a.parquet" ,
431- "train-00127-of-00253-7caf3e5a3dbd9a93.parquet" ,
432- "train-00128-of-00253-fe0d9efbdafab63d.parquet" ,
433- "train-00129-of-00253-a7d26980242676a1.parquet" ,
434- "train-00130-of-00253-99020d76cab00a44.parquet" ,
435- "train-00131-of-00253-7e616cb3df356909.parquet" ,
436- "train-00132-of-00253-5ff1dae3276d5fd9.parquet" ,
437- "train-00133-of-00253-51dd993b5f02f14f.parquet" ,
438- "train-00134-of-00253-1cc963ff231ae094.parquet" ,
439- "train-00135-of-00253-368cb56b1fcb5abb.parquet" ,
440- "train-00136-of-00253-b7aa50b199c86e5d.parquet" ,
441- "train-00137-of-00253-074847c192f9275c.parquet" ,
442- "train-00138-of-00253-a8a9afd0622163b5.parquet" ,
443- "train-00139-of-00253-5f83fc25ba5044f5.parquet" ,
444- "train-00140-of-00253-66322d24a05da2b9.parquet" ,
445- "train-00141-of-00253-98a7b2c8c1c33319.parquet" ,
446- "train-00142-of-00253-2e7e6803e575bbdc.parquet" ,
447- "train-00143-of-00253-ef5ce0cc0fa39f59.parquet" ,
448- "train-00144-of-00253-476682d833ed9d9a.parquet" ,
449- "train-00145-of-00253-f686dc637743677e.parquet" ,
450- "train-00146-of-00253-622f1f7bac6eb765.parquet" ,
451- "train-00147-of-00253-97c56689522ea998.parquet" ,
452- "train-00148-of-00253-3cf60fddf4af7695.parquet" ,
453- "train-00149-of-00253-fd4e7bc14dffd06f.parquet" ,
454- "train-00150-of-00253-98e0ebf98f324b7f.parquet" ,
455- "train-00151-of-00253-2314099bf6f14c19.parquet" ,
456- "train-00152-of-00253-68218bc90e52b270.parquet" ,
457- "train-00153-of-00253-a96b804645dc1183.parquet" ,
458- "train-00154-of-00253-ce5ffde92833dc3c.parquet" ,
459- "train-00155-of-00253-8842f6af364e4344.parquet" ,
460- "train-00156-of-00253-b2e495e368e3140a.parquet" ,
461- "train-00157-of-00253-fc8d2f720317c51d.parquet" ,
462- "train-00158-of-00253-094972377866b6d7.parquet" ,
463- "train-00159-of-00253-fa1311efd6285c56.parquet" ,
464- "train-00160-of-00253-d481b51d41645f30.parquet" ,
465- "train-00161-of-00253-bb905214b7459ce8.parquet" ,
466- "train-00162-of-00253-8b6842d793b20eb9.parquet" ,
467- "train-00163-of-00253-c217aecaceda2002.parquet" ,
468- "train-00164-of-00253-8f892d491d0426cd.parquet" ,
469- "train-00165-of-00253-b3f683f5ca4ed0bd.parquet" ,
470- "train-00166-of-00253-25f7b96ce2cd2b06.parquet" ,
471- "train-00167-of-00253-53e0c16ecd561461.parquet" ,
472- "train-00168-of-00253-c3b5f215436ca395.parquet" ,
473- "train-00169-of-00253-b2514926fde0539c.parquet" ,
474- "train-00170-of-00253-bf76a82c77844ff7.parquet" ,
475- "train-00171-of-00253-7f3b9c96ce7cd722.parquet" ,
476- "train-00172-of-00253-b8d0406c16d4f34d.parquet" ,
477- "train-00173-of-00253-2ac71b08c877ed93.parquet" ,
478- "train-00174-of-00253-821fceeaf9217d62.parquet" ,
479- "train-00175-of-00253-433c2b9472f3cb6b.parquet" ,
480- "train-00176-of-00253-4ecb0791dff33e14.parquet" ,
481- "train-00177-of-00253-8409a99d82dc08d3.parquet" ,
482- "train-00178-of-00253-8e097439adcc1a8d.parquet" ,
483- "train-00179-of-00253-e9fe011f915f0696.parquet" ,
484- "train-00180-of-00253-7171103b699c1ad2.parquet" ,
485- "train-00181-of-00253-e123f247fd8991e8.parquet" ,
486- "train-00182-of-00253-2a1de2bb55bcf488.parquet" ,
487- "train-00183-of-00253-7a3974aa00c6fe7a.parquet" ,
488- "train-00184-of-00253-6c6df32e5749412b.parquet" ,
489- "train-00185-of-00253-311b99c7bdbb09df.parquet" ,
490- "train-00186-of-00253-b054a3b715d31e45.parquet" ,
491- "train-00187-of-00253-381e78f238e38a05.parquet" ,
492- "train-00188-of-00253-adfaaadc8e8c673e.parquet" ,
493- "train-00189-of-00253-bf26f0488b2a52c7.parquet" ,
494- "train-00190-of-00253-572755709abc79b8.parquet" ,
495- "train-00191-of-00253-654bf58b4a7e741e.parquet" ,
496- "train-00192-of-00253-6603b14592b5c863.parquet" ,
497- "train-00193-of-00253-c7462f0773e54ea5.parquet" ,
498- "train-00194-of-00253-ddd1253bcb446bd2.parquet" ,
499- "train-00195-of-00253-21b71cad8df04442.parquet" ,
500- "train-00196-of-00253-a9fc06012b336c8a.parquet" ,
501- "train-00197-of-00253-149761f60d6f82ce.parquet" ,
502- "train-00198-of-00253-5f84d2689e498ce3.parquet" ,
503- "train-00199-of-00253-6df174ec4afbc754.parquet" ,
504- "train-00200-of-00253-f25cbfbc4ecc46e5.parquet" ,
505- "train-00201-of-00253-c0120a0a641a83e5.parquet" ,
506- "train-00202-of-00253-71fd457b00397688.parquet" ,
507- "train-00203-of-00253-0147f12bab09cb08.parquet" ,
508- "train-00204-of-00253-39a83604836d314f.parquet" ,
509- "train-00205-of-00253-2b14def07f4131d0.parquet" ,
510- "train-00206-of-00253-898a272d08173235.parquet" ,
511- "train-00207-of-00253-c77b10aa2f513766.parquet" ,
512- "train-00208-of-00253-232e02b3b4410b93.parquet" ,
513- "train-00209-of-00253-99d95f12a455e6f9.parquet" ,
514- "train-00210-of-00253-5e1e7f42a0538659.parquet" ,
515- "train-00211-of-00253-9e8789ed7b9d09a0.parquet" ,
516- "train-00212-of-00253-a322bc59c67a8eb7.parquet" ,
517- "train-00213-of-00253-d6cdc38743c7166a.parquet" ,
518- "train-00214-of-00253-db75b992eef7e6f3.parquet" ,
519- "train-00215-of-00253-b10c2c91a0ff0461.parquet" ,
520- "train-00216-of-00253-32fd09d79b4bfcb8.parquet" ,
521- "train-00217-of-00253-09fe8e37142afff0.parquet" ,
522- "train-00218-of-00253-9ba4f606c1f890a7.parquet" ,
523- "train-00219-of-00253-77e5f74f50608c84.parquet" ,
524- "train-00220-of-00253-0f358981f5c4b0ea.parquet" ,
525- "train-00221-of-00253-d63cb1b3f67ca2e3.parquet" ,
526- "train-00222-of-00253-e0ae1cc95eb9162f.parquet" ,
527- "train-00223-of-00253-92b87e0ca46a851e.parquet" ,
528- "train-00224-of-00253-95caa824de31383b.parquet" ,
529- "train-00225-of-00253-f18735143103eb3d.parquet" ,
530- "train-00226-of-00253-9e5c2a122e1ee14c.parquet" ,
531- "train-00227-of-00253-18cd94c647ab72c7.parquet" ,
532- "train-00228-of-00253-67f15d553a91ec1c.parquet" ,
533- "train-00229-of-00253-5fd86b234ddf06c4.parquet" ,
534- "train-00230-of-00253-f769913c0527d080.parquet" ,
535- "train-00231-of-00253-7d929a7d638988f1.parquet" ,
536- "train-00232-of-00253-6d7d44691652d499.parquet" ,
537- "train-00233-of-00253-37c0041e33745541.parquet" ,
538- "train-00234-of-00253-9198599261898de8.parquet" ,
539- "train-00235-of-00253-781e2a384bb1d5f3.parquet" ,
540- "train-00236-of-00253-7520b54396b5716f.parquet" ,
541- "train-00237-of-00253-ac832e864517a5c0.parquet" ,
542- "train-00238-of-00253-228a746b4c50d88a.parquet" ,
543- "train-00239-of-00253-0c922ef3686b8db7.parquet" ,
544- "train-00240-of-00253-ebe435b211e745f8.parquet" ,
545- "train-00241-of-00253-7547d4989a92e648.parquet" ,
546- "train-00242-of-00253-08106b9083591997.parquet" ,
547- "train-00243-of-00253-8ec89bb8403bcfbe.parquet" ,
548- "train-00244-of-00253-45b346edb004bb23.parquet" ,
549- "train-00245-of-00253-40e44253337b5228.parquet" ,
550- "train-00246-of-00253-0a5c5d98e0e009a1.parquet" ,
551- "train-00247-of-00253-1290ad384174b5cb.parquet" ,
552- "train-00248-of-00253-891cf07cd5ff0b86.parquet" ,
553- "train-00249-of-00253-b81c028d5c1ec216.parquet" ,
554- "train-00250-of-00253-f335644d88aa7e77.parquet" ,
555- "train-00251-of-00253-768f2f477249701c.parquet" ,
556- "train-00252-of-00253-6c465b1c097702e9.parquet" ,
557- }
0 commit comments