Skip to content

Commit 3002c42

Browse files
committed
fix wiki embeddings
The old ones (Cohere/wikipedia-22-12-en-embeddings) were deleted; switch to Cohere/wikipedia-2023-11-embed-multilingual-v3.
1 parent 4385fad commit 3002c42

File tree

2 files changed

+38
-532
lines changed

2 files changed

+38
-532
lines changed

cmd/nightly/datasource_cohere.go

Lines changed: 19 additions & 267 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package main
22

33
import (
44
"context"
5-
"errors"
65
"fmt"
76
"iter"
87
"log/slog"
@@ -19,7 +18,7 @@ import (
1918
)
2019

2120
// CohereWikipediaEmbeddings provides template data from Cohere's Wikipedia embeddings.
22-
// See: https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings
21+
// See: https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3
2322
type CohereWikipediaEmbeddings struct {
2423
logger *slog.Logger
2524
lock sync.Mutex // protects downloads
@@ -81,7 +80,7 @@ func (c *CohereWikipediaEmbeddings) downloadFile(ctx context.Context, fileName s
8180
c.logger.Info("downloading file", slog.String("file", fileName))
8281

8382
url := fmt.Sprintf(
84-
"https://huggingface.co/datasets/Cohere/wikipedia-22-12-en-embeddings/resolve/main/data/%s?download=true",
83+
"https://huggingface.co/datasets/Cohere/wikipedia-2023-11-embed-multilingual-v3/resolve/main/en/%s?download=true",
8584
fileName,
8685
)
8786
req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
@@ -161,15 +160,17 @@ func (cvs *cohereVectorSource) Vector(dims int) ([]float32, error) {
161160
}
162161
}
163162

163+
// cohereWikipediaEmbeddingFileCount is the exclusive upper bound on the file
// index used by the loadNextFile methods: an index at or above this value
// yields a "no more files" error, so indexes 0 through 414 are the ones
// turned into file names.
// NOTE(review): presumably this matches the number of parquet files under
// en/ in the upstream dataset — confirm against the dataset file listing.
const cohereWikipediaEmbeddingFileCount = 415
164+
164165
func (cvs *cohereVectorSource) loadNextFile(ctx context.Context) error {
165166
fileIdx := cvs.nextFile
166-
if fileIdx >= len(cohereWikipediaEmbeddingFiles) {
167-
return errors.New("no more files")
167+
if fileIdx >= cohereWikipediaEmbeddingFileCount {
168+
return fmt.Errorf("no more files")
168169
}
169170
cvs.nextFile++
170171

171172
var (
172-
fname = cohereWikipediaEmbeddingFiles[fileIdx]
173+
fname = cohereEmbeddingFileName(fileIdx)
173174
fp = cvs.orig.filePath(fname)
174175
)
175176
if _, err := os.Stat(fp); os.IsNotExist(err) {
@@ -183,7 +184,7 @@ func (cvs *cohereVectorSource) loadNextFile(ctx context.Context) error {
183184
return fmt.Errorf("failed to memory map file %s: %w", fp, err)
184185
}
185186

186-
vectorSeq, err := readVectorColumn(mmapped, 8, 768)
187+
vectorSeq, err := readVectorColumn(mmapped, 4, 1024)
187188
if err != nil {
188189
return fmt.Errorf("failed to read vectors from file %s: %w", fp, err)
189190
}
@@ -251,13 +252,13 @@ func (cts *cohereTextSource) getText() (string, error) {
251252

252253
func (cts *cohereTextSource) loadNextFile(ctx context.Context) error {
253254
fileIdx := cts.nextFile
254-
if fileIdx >= len(cohereWikipediaEmbeddingFiles) {
255-
return errors.New("no more files")
255+
if fileIdx >= cohereWikipediaEmbeddingFileCount {
256+
return fmt.Errorf("no more files")
256257
}
257258
cts.nextFile++
258259

259260
var (
260-
fname = cohereWikipediaEmbeddingFiles[fileIdx]
261+
fname = cohereEmbeddingFileName(fileIdx)
261262
fp = cts.orig.filePath(fname)
262263
)
263264
if _, err := os.Stat(fp); os.IsNotExist(err) {
@@ -270,7 +271,7 @@ func (cts *cohereTextSource) loadNextFile(ctx context.Context) error {
270271
if err != nil {
271272
return fmt.Errorf("failed to memory map file %s: %w", fp, err)
272273
}
273-
textSeq, err := readTextColumn(mmapped.Data, 2)
274+
textSeq, err := readTextColumn(mmapped.Data, 3)
274275
if err != nil {
275276
return fmt.Errorf("failed to read texts from file %s: %w", fp, err)
276277
}
@@ -280,6 +281,13 @@ func (cts *cohereTextSource) loadNextFile(ctx context.Context) error {
280281
return nil
281282
}
282283

284+
// cohereEmbeddingFileName generates the filename for a Cohere embedding file
285+
// based on its index. Files are named as 4-digit zero-padded numbers, e.g.,
286+
// "0000.parquet", "0001.parquet", etc.
//
// idx is not validated here. The loadNextFile methods check it against
// cohereWikipediaEmbeddingFileCount before calling. The %04d verb pads to a
// minimum width of four, so an index of 10000 or more produces a longer name
// and a negative index produces a name with a leading minus sign.
287+
func cohereEmbeddingFileName(idx int) string {
288+
return fmt.Sprintf("%04d.parquet", idx)
289+
}
290+
283291
func readTextColumn(fileContent []byte, column int64) (iter.Seq[string], error) {
284292
bf := buffer.NewBufferFileFromBytesNoAlloc(fileContent)
285293
pr, err := reader.NewParquetColumnReader(bf, 1)
@@ -299,259 +307,3 @@ func readTextColumn(fileContent []byte, column int64) (iter.Seq[string], error)
299307
}
300308
}, nil
301309
}
302-
303-
var cohereWikipediaEmbeddingFiles = []string{
304-
"train-00000-of-00253-8d3dffb4e6ef0304.parquet",
305-
"train-00001-of-00253-2840fd802467fbe7.parquet",
306-
"train-00002-of-00253-0ecc6c7ff8c4fa3c.parquet",
307-
"train-00003-of-00253-32f0ed655d4213a4.parquet",
308-
"train-00004-of-00253-8c33c9247a95d7f8.parquet",
309-
"train-00005-of-00253-3bddd83c63a94665.parquet",
310-
"train-00006-of-00253-85d4ae179ff9070e.parquet",
311-
"train-00007-of-00253-8b41b4aec099f4b5.parquet",
312-
"train-00008-of-00253-a19dcf6c4aa42b30.parquet",
313-
"train-00009-of-00253-066c6e7f9bde2198.parquet",
314-
"train-00010-of-00253-b374cd991cbb5156.parquet",
315-
"train-00011-of-00253-795bbf4873fcc654.parquet",
316-
"train-00012-of-00253-4917d4a02ad59415.parquet",
317-
"train-00013-of-00253-acdc606afcb212c0.parquet",
318-
"train-00014-of-00253-2aee8194c5707647.parquet",
319-
"train-00015-of-00253-22d46ba400546eee.parquet",
320-
"train-00016-of-00253-801956227c7f4aa3.parquet",
321-
"train-00017-of-00253-6d74f63e2ad1b730.parquet",
322-
"train-00018-of-00253-255f79226706d5c1.parquet",
323-
"train-00019-of-00253-f4b2d38fe84c0ead.parquet",
324-
"train-00020-of-00253-1de85ebaad99658e.parquet",
325-
"train-00021-of-00253-67ad23e305e3295c.parquet",
326-
"train-00022-of-00253-f9c5d70b7960cda7.parquet",
327-
"train-00023-of-00253-b244988e872e8a1d.parquet",
328-
"train-00024-of-00253-f423c747987257c1.parquet",
329-
"train-00025-of-00253-29396090fdc2a78a.parquet",
330-
"train-00026-of-00253-5ddb43fb1770e7f8.parquet",
331-
"train-00027-of-00253-752a8bc5622d323e.parquet",
332-
"train-00028-of-00253-da03da3e8033c428.parquet",
333-
"train-00029-of-00253-5e2ba1902a8f5656.parquet",
334-
"train-00030-of-00253-c4e515713a5cc8e0.parquet",
335-
"train-00031-of-00253-5c2c714efc08b13c.parquet",
336-
"train-00032-of-00253-c9efe2798824ac51.parquet",
337-
"train-00033-of-00253-be9e14cac59e122e.parquet",
338-
"train-00034-of-00253-ccc6b075279d9712.parquet",
339-
"train-00035-of-00253-834ae2f2c5285a99.parquet",
340-
"train-00036-of-00253-74856ee603b672f0.parquet",
341-
"train-00037-of-00253-116477598973b86c.parquet",
342-
"train-00038-of-00253-17b2331379fa72f6.parquet",
343-
"train-00039-of-00253-7be12594af855ee2.parquet",
344-
"train-00040-of-00253-84f19228bd41f6da.parquet",
345-
"train-00041-of-00253-853105c503a5710a.parquet",
346-
"train-00042-of-00253-f9187b7900752a8a.parquet",
347-
"train-00043-of-00253-6e57290c3c8b5d5f.parquet",
348-
"train-00044-of-00253-931ce22b00a4c15b.parquet",
349-
"train-00045-of-00253-390980d0f55a07cb.parquet",
350-
"train-00046-of-00253-89ed19ffa8016398.parquet",
351-
"train-00047-of-00253-3c12f3ead94bfbfb.parquet",
352-
"train-00048-of-00253-eaa84bcfb85c6f72.parquet",
353-
"train-00049-of-00253-f60d394df69f0cfd.parquet",
354-
"train-00050-of-00253-2e9e406b1fe1637d.parquet",
355-
"train-00051-of-00253-a59dcef64c057c2f.parquet",
356-
"train-00052-of-00253-c890ab67a7833e35.parquet",
357-
"train-00053-of-00253-d2066657b608a39c.parquet",
358-
"train-00054-of-00253-078aebe8abfb2ce8.parquet",
359-
"train-00055-of-00253-005c356342d5bd48.parquet",
360-
"train-00056-of-00253-c7aa34ae740fe73c.parquet",
361-
"train-00057-of-00253-0ac063e31c203212.parquet",
362-
"train-00058-of-00253-bd2bc91dc377d4a3.parquet",
363-
"train-00059-of-00253-a49d9bfcc1c73245.parquet",
364-
"train-00060-of-00253-66bcd667a0f51ca1.parquet",
365-
"train-00061-of-00253-e03a6c0d915a0d72.parquet",
366-
"train-00062-of-00253-164bc5605313cf93.parquet",
367-
"train-00063-of-00253-eb485500a368fb6e.parquet",
368-
"train-00064-of-00253-0017fc575755acc7.parquet",
369-
"train-00065-of-00253-8c43415a5f2be2ce.parquet",
370-
"train-00066-of-00253-ec9c5821e40f26f4.parquet",
371-
"train-00067-of-00253-dc20a358a4dec4ef.parquet",
372-
"train-00068-of-00253-a668d48636bd4ad6.parquet",
373-
"train-00069-of-00253-e5adf7e0505b0ed9.parquet",
374-
"train-00070-of-00253-a37d9c23f701c52c.parquet",
375-
"train-00071-of-00253-2ad7eba51e43c84a.parquet",
376-
"train-00072-of-00253-c750269b7e722e9c.parquet",
377-
"train-00073-of-00253-306cdafd84214680.parquet",
378-
"train-00074-of-00253-2d90645be188e613.parquet",
379-
"train-00075-of-00253-0f2ea04b7339877e.parquet",
380-
"train-00076-of-00253-93b8a7854df926bd.parquet",
381-
"train-00077-of-00253-c721bb168a7ab59a.parquet",
382-
"train-00078-of-00253-ae44665c35f92328.parquet",
383-
"train-00079-of-00253-c7436cb8e9728f6e.parquet",
384-
"train-00080-of-00253-49d0d951966b3c22.parquet",
385-
"train-00081-of-00253-227c70e7b165e2b4.parquet",
386-
"train-00082-of-00253-1269befa065af101.parquet",
387-
"train-00083-of-00253-fc5e8a5fa73be0e7.parquet",
388-
"train-00084-of-00253-f13a198f26475f4b.parquet",
389-
"train-00085-of-00253-c1fa9e92d40e7c52.parquet",
390-
"train-00086-of-00253-117382acbaf2d268.parquet",
391-
"train-00087-of-00253-650b081492b280e8.parquet",
392-
"train-00088-of-00253-8e74cc842f11c0ca.parquet",
393-
"train-00089-of-00253-cf36d64831d2fc3a.parquet",
394-
"train-00090-of-00253-71d852dfdb9d6cfc.parquet",
395-
"train-00091-of-00253-0912a1fd533b07f1.parquet",
396-
"train-00092-of-00253-a5c6e71c0c70fec6.parquet",
397-
"train-00093-of-00253-d3d19c66e736f451.parquet",
398-
"train-00094-of-00253-890a031231a7fa6b.parquet",
399-
"train-00095-of-00253-ba267d0930a2f943.parquet",
400-
"train-00096-of-00253-fd118f187a0a5a70.parquet",
401-
"train-00097-of-00253-76271b0701e12b92.parquet",
402-
"train-00098-of-00253-9be1c850b35663be.parquet",
403-
"train-00099-of-00253-7b61e259ab69e144.parquet",
404-
"train-00100-of-00253-1a7a4c5d83f9b58d.parquet",
405-
"train-00101-of-00253-b18b780bfb3cb994.parquet",
406-
"train-00102-of-00253-8adc6f0687e89f39.parquet",
407-
"train-00103-of-00253-3f98bb88e6710c42.parquet",
408-
"train-00104-of-00253-190d8475a05317d6.parquet",
409-
"train-00105-of-00253-c3783ca560352491.parquet",
410-
"train-00106-of-00253-805da5014fb3169f.parquet",
411-
"train-00107-of-00253-f501e794311cc86c.parquet",
412-
"train-00108-of-00253-7a6399540e7664be.parquet",
413-
"train-00109-of-00253-6b04d06ed2afe35f.parquet",
414-
"train-00110-of-00253-fc14df8eb2dba67d.parquet",
415-
"train-00111-of-00253-35420c7229adc959.parquet",
416-
"train-00112-of-00253-3ad9687af1fb6db1.parquet",
417-
"train-00113-of-00253-1b778b3bc5ed1a5a.parquet",
418-
"train-00114-of-00253-a5caaebba1f2381b.parquet",
419-
"train-00115-of-00253-c66611cd4369dfea.parquet",
420-
"train-00116-of-00253-ce5b4f38ffcefe3e.parquet",
421-
"train-00117-of-00253-6937c17f9c6ee8b0.parquet",
422-
"train-00118-of-00253-ffd636470e41df94.parquet",
423-
"train-00119-of-00253-c716b06fe5c720ac.parquet",
424-
"train-00120-of-00253-950fdfc157360aa5.parquet",
425-
"train-00121-of-00253-4a433b375723ae25.parquet",
426-
"train-00122-of-00253-4a048360997b48dc.parquet",
427-
"train-00123-of-00253-f44a87ba12d3f01f.parquet",
428-
"train-00124-of-00253-33590ef565c33d3a.parquet",
429-
"train-00125-of-00253-5d535fbc76c00aff.parquet",
430-
"train-00126-of-00253-542de0e05c14e36a.parquet",
431-
"train-00127-of-00253-7caf3e5a3dbd9a93.parquet",
432-
"train-00128-of-00253-fe0d9efbdafab63d.parquet",
433-
"train-00129-of-00253-a7d26980242676a1.parquet",
434-
"train-00130-of-00253-99020d76cab00a44.parquet",
435-
"train-00131-of-00253-7e616cb3df356909.parquet",
436-
"train-00132-of-00253-5ff1dae3276d5fd9.parquet",
437-
"train-00133-of-00253-51dd993b5f02f14f.parquet",
438-
"train-00134-of-00253-1cc963ff231ae094.parquet",
439-
"train-00135-of-00253-368cb56b1fcb5abb.parquet",
440-
"train-00136-of-00253-b7aa50b199c86e5d.parquet",
441-
"train-00137-of-00253-074847c192f9275c.parquet",
442-
"train-00138-of-00253-a8a9afd0622163b5.parquet",
443-
"train-00139-of-00253-5f83fc25ba5044f5.parquet",
444-
"train-00140-of-00253-66322d24a05da2b9.parquet",
445-
"train-00141-of-00253-98a7b2c8c1c33319.parquet",
446-
"train-00142-of-00253-2e7e6803e575bbdc.parquet",
447-
"train-00143-of-00253-ef5ce0cc0fa39f59.parquet",
448-
"train-00144-of-00253-476682d833ed9d9a.parquet",
449-
"train-00145-of-00253-f686dc637743677e.parquet",
450-
"train-00146-of-00253-622f1f7bac6eb765.parquet",
451-
"train-00147-of-00253-97c56689522ea998.parquet",
452-
"train-00148-of-00253-3cf60fddf4af7695.parquet",
453-
"train-00149-of-00253-fd4e7bc14dffd06f.parquet",
454-
"train-00150-of-00253-98e0ebf98f324b7f.parquet",
455-
"train-00151-of-00253-2314099bf6f14c19.parquet",
456-
"train-00152-of-00253-68218bc90e52b270.parquet",
457-
"train-00153-of-00253-a96b804645dc1183.parquet",
458-
"train-00154-of-00253-ce5ffde92833dc3c.parquet",
459-
"train-00155-of-00253-8842f6af364e4344.parquet",
460-
"train-00156-of-00253-b2e495e368e3140a.parquet",
461-
"train-00157-of-00253-fc8d2f720317c51d.parquet",
462-
"train-00158-of-00253-094972377866b6d7.parquet",
463-
"train-00159-of-00253-fa1311efd6285c56.parquet",
464-
"train-00160-of-00253-d481b51d41645f30.parquet",
465-
"train-00161-of-00253-bb905214b7459ce8.parquet",
466-
"train-00162-of-00253-8b6842d793b20eb9.parquet",
467-
"train-00163-of-00253-c217aecaceda2002.parquet",
468-
"train-00164-of-00253-8f892d491d0426cd.parquet",
469-
"train-00165-of-00253-b3f683f5ca4ed0bd.parquet",
470-
"train-00166-of-00253-25f7b96ce2cd2b06.parquet",
471-
"train-00167-of-00253-53e0c16ecd561461.parquet",
472-
"train-00168-of-00253-c3b5f215436ca395.parquet",
473-
"train-00169-of-00253-b2514926fde0539c.parquet",
474-
"train-00170-of-00253-bf76a82c77844ff7.parquet",
475-
"train-00171-of-00253-7f3b9c96ce7cd722.parquet",
476-
"train-00172-of-00253-b8d0406c16d4f34d.parquet",
477-
"train-00173-of-00253-2ac71b08c877ed93.parquet",
478-
"train-00174-of-00253-821fceeaf9217d62.parquet",
479-
"train-00175-of-00253-433c2b9472f3cb6b.parquet",
480-
"train-00176-of-00253-4ecb0791dff33e14.parquet",
481-
"train-00177-of-00253-8409a99d82dc08d3.parquet",
482-
"train-00178-of-00253-8e097439adcc1a8d.parquet",
483-
"train-00179-of-00253-e9fe011f915f0696.parquet",
484-
"train-00180-of-00253-7171103b699c1ad2.parquet",
485-
"train-00181-of-00253-e123f247fd8991e8.parquet",
486-
"train-00182-of-00253-2a1de2bb55bcf488.parquet",
487-
"train-00183-of-00253-7a3974aa00c6fe7a.parquet",
488-
"train-00184-of-00253-6c6df32e5749412b.parquet",
489-
"train-00185-of-00253-311b99c7bdbb09df.parquet",
490-
"train-00186-of-00253-b054a3b715d31e45.parquet",
491-
"train-00187-of-00253-381e78f238e38a05.parquet",
492-
"train-00188-of-00253-adfaaadc8e8c673e.parquet",
493-
"train-00189-of-00253-bf26f0488b2a52c7.parquet",
494-
"train-00190-of-00253-572755709abc79b8.parquet",
495-
"train-00191-of-00253-654bf58b4a7e741e.parquet",
496-
"train-00192-of-00253-6603b14592b5c863.parquet",
497-
"train-00193-of-00253-c7462f0773e54ea5.parquet",
498-
"train-00194-of-00253-ddd1253bcb446bd2.parquet",
499-
"train-00195-of-00253-21b71cad8df04442.parquet",
500-
"train-00196-of-00253-a9fc06012b336c8a.parquet",
501-
"train-00197-of-00253-149761f60d6f82ce.parquet",
502-
"train-00198-of-00253-5f84d2689e498ce3.parquet",
503-
"train-00199-of-00253-6df174ec4afbc754.parquet",
504-
"train-00200-of-00253-f25cbfbc4ecc46e5.parquet",
505-
"train-00201-of-00253-c0120a0a641a83e5.parquet",
506-
"train-00202-of-00253-71fd457b00397688.parquet",
507-
"train-00203-of-00253-0147f12bab09cb08.parquet",
508-
"train-00204-of-00253-39a83604836d314f.parquet",
509-
"train-00205-of-00253-2b14def07f4131d0.parquet",
510-
"train-00206-of-00253-898a272d08173235.parquet",
511-
"train-00207-of-00253-c77b10aa2f513766.parquet",
512-
"train-00208-of-00253-232e02b3b4410b93.parquet",
513-
"train-00209-of-00253-99d95f12a455e6f9.parquet",
514-
"train-00210-of-00253-5e1e7f42a0538659.parquet",
515-
"train-00211-of-00253-9e8789ed7b9d09a0.parquet",
516-
"train-00212-of-00253-a322bc59c67a8eb7.parquet",
517-
"train-00213-of-00253-d6cdc38743c7166a.parquet",
518-
"train-00214-of-00253-db75b992eef7e6f3.parquet",
519-
"train-00215-of-00253-b10c2c91a0ff0461.parquet",
520-
"train-00216-of-00253-32fd09d79b4bfcb8.parquet",
521-
"train-00217-of-00253-09fe8e37142afff0.parquet",
522-
"train-00218-of-00253-9ba4f606c1f890a7.parquet",
523-
"train-00219-of-00253-77e5f74f50608c84.parquet",
524-
"train-00220-of-00253-0f358981f5c4b0ea.parquet",
525-
"train-00221-of-00253-d63cb1b3f67ca2e3.parquet",
526-
"train-00222-of-00253-e0ae1cc95eb9162f.parquet",
527-
"train-00223-of-00253-92b87e0ca46a851e.parquet",
528-
"train-00224-of-00253-95caa824de31383b.parquet",
529-
"train-00225-of-00253-f18735143103eb3d.parquet",
530-
"train-00226-of-00253-9e5c2a122e1ee14c.parquet",
531-
"train-00227-of-00253-18cd94c647ab72c7.parquet",
532-
"train-00228-of-00253-67f15d553a91ec1c.parquet",
533-
"train-00229-of-00253-5fd86b234ddf06c4.parquet",
534-
"train-00230-of-00253-f769913c0527d080.parquet",
535-
"train-00231-of-00253-7d929a7d638988f1.parquet",
536-
"train-00232-of-00253-6d7d44691652d499.parquet",
537-
"train-00233-of-00253-37c0041e33745541.parquet",
538-
"train-00234-of-00253-9198599261898de8.parquet",
539-
"train-00235-of-00253-781e2a384bb1d5f3.parquet",
540-
"train-00236-of-00253-7520b54396b5716f.parquet",
541-
"train-00237-of-00253-ac832e864517a5c0.parquet",
542-
"train-00238-of-00253-228a746b4c50d88a.parquet",
543-
"train-00239-of-00253-0c922ef3686b8db7.parquet",
544-
"train-00240-of-00253-ebe435b211e745f8.parquet",
545-
"train-00241-of-00253-7547d4989a92e648.parquet",
546-
"train-00242-of-00253-08106b9083591997.parquet",
547-
"train-00243-of-00253-8ec89bb8403bcfbe.parquet",
548-
"train-00244-of-00253-45b346edb004bb23.parquet",
549-
"train-00245-of-00253-40e44253337b5228.parquet",
550-
"train-00246-of-00253-0a5c5d98e0e009a1.parquet",
551-
"train-00247-of-00253-1290ad384174b5cb.parquet",
552-
"train-00248-of-00253-891cf07cd5ff0b86.parquet",
553-
"train-00249-of-00253-b81c028d5c1ec216.parquet",
554-
"train-00250-of-00253-f335644d88aa7e77.parquet",
555-
"train-00251-of-00253-768f2f477249701c.parquet",
556-
"train-00252-of-00253-6c465b1c097702e9.parquet",
557-
}

0 commit comments

Comments
 (0)