|
7 | 7 | from pathlib import Path |
8 | 8 | from typing import TYPE_CHECKING, Any |
9 | 9 |
|
10 | | -from mteb._create_dataloaders import ( |
11 | | - create_dataloader, |
12 | | -) |
| 10 | +from mteb._create_dataloaders import create_dataloader |
13 | 11 | from mteb._requires_package import requires_package |
14 | 12 | from mteb.models.abs_encoder import AbsEncoder, get_prompt |
15 | 13 | from mteb.models.model_meta import ModelMeta, ScoringFunction |
@@ -487,17 +485,255 @@ def _encode( |
487 | 485 | use_instructions=False, |
488 | 486 | adapted_from="Alibaba-NLP/gte-modernbert-base", |
489 | 487 | superseded_by=None, |
| 488 | + training_datasets={ |
| 489 | + "MSMARCO", |
| 490 | + "MSMARCOHardNegatives", |
| 491 | + "NanoMSMARCORetrieval", |
| 492 | + "NQ", |
| 493 | + "NQHardNegatives", |
| 494 | + "NanoNQRetrieval", |
| 495 | + "HotpotQA", |
| 496 | + "HotpotQAHardNegatives", |
| 497 | + "CodeSearchNet", |
| 498 | + "FEVER", |
| 499 | + "DBPedia", |
| 500 | + "DBPediaHardNegatives.v2", |
| 501 | + "NanoDBPediaRetrieval", |
| 502 | + "TRECDL2019", |
| 503 | + "TRECDL2020", |
| 504 | + "CornStack", |
| 505 | + }, |
| 506 | + citation="""@misc{GTE-ModernColBERT, |
| 507 | + title={GTE-ModernColBERT}, |
| 508 | + author={Chaffin, Antoine}, |
| 509 | + url={https://huggingface.co/lightonai/GTE-ModernColBERT-v1}, |
| 510 | + year={2025} |
| 511 | +}""", |
| 512 | +) |
| 513 | + |
# Registry metadata for lightonai/LateOn-Code-pretrain, a late-interaction
# checkpoint that is loaded through MultiVectorModel and scored with MaxSim.
lightonai__late_on_code_pretrain = ModelMeta(
    # -- identity and loading --
    name="lightonai/LateOn-Code-pretrain",
    revision="71251a6ee61eee488de7e3ae29f5fb4c3c94699b",
    loader=MultiVectorModel,
    reference="https://huggingface.co/lightonai/LateOn-Code-pretrain",
    release_date="2026-02-12",
    license="apache-2.0",
    open_weights=True,
    superseded_by=None,
    # -- model characteristics --
    model_type=["late-interaction"],
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    use_instructions=False,
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    # -- size and limits --
    n_parameters=149_000_000,  # same value as int(149 * 1e6)
    n_embedding_parameters=38684160,
    memory_usage_mb=568,
    max_tokens=8192,
    embed_dim=128,
    # -- training provenance --
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/pre-training.py",
    public_training_data="https://huggingface.co/datasets/lightonai/cornstack",
    # A set, so member order carries no meaning; kept alphabetical for diffs.
    training_datasets={
        "CodeSearchNet",
        "CornStack",
        "DBPedia",
        "DBPediaHardNegatives.v2",
        "FEVER",
        "HotpotQA",
        "HotpotQAHardNegatives",
        "MSMARCO",
        "MSMARCOHardNegatives",
        "NQ",
        "NQHardNegatives",
        "NanoDBPediaRetrieval",
        "NanoMSMARCORetrieval",
        "NanoNQRetrieval",
        "TRECDL2019",
        "TRECDL2020",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
| 568 | + |
| 569 | + |
# Registry metadata for lightonai/LateOn-Code, a late-interaction model loaded
# through MultiVectorModel and scored with MaxSim. It is declared as adapted
# from the lightonai/LateOn-Code-pretrain checkpoint.
lightonai__late_on_code = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code",
    model_type=["late-interaction"],
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="734b659a57935ef50562d79581c3ff1f8d825c93",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/fine_tuning.py",
    public_training_data="https://huggingface.co/datasets/lightonai/nv-embed-supervised-distill-dedup-code",
    release_date="2026-02-12",
    n_parameters=int(149 * 1e6),
    n_embedding_parameters=38684160,
    memory_usage_mb=568,
    max_tokens=8192,
    embed_dim=128,
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code",
    use_instructions=False,
    adapted_from="lightonai/LateOn-Code-pretrain",
    superseded_by=None,
    # Each dataset is listed exactly once: duplicate members in a set literal
    # are silently collapsed and only obscure what the set really contains.
    # NOTE(review): "mMARCO-NL" is not listed on the other LateOn-Code
    # entries, and this set does not repeat the datasets declared on the
    # `adapted_from` checkpoint the way LateOn-Code-edge does -- confirm that
    # this list is intentional.
    training_datasets={
        "MSMARCO",
        "mMARCO-NL",
        "CornStack",
        "AppsRetrieval",
        "SyntheticText2SQL",
        "CosQA",
        "CodeFeedbackMT",
        "CodeFeedbackST",
        "StackOverflowQA",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CodeSearchNetRetrieval",
        "CodeSearchNetCCRetrieval",
        "COIRCodeSearchNetRetrieval",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
| 634 | + |
# Registry metadata for lightonai/LateOn-Code-edge-pretrain, a late-interaction
# checkpoint loaded through MultiVectorModel and scored with MaxSim. It is
# declared as adapted from mixedbread-ai/mxbai-edge-colbert-v0-17m.
lightonai__late_on_code_edge_pretrain = ModelMeta(
    # -- identity and loading --
    name="lightonai/LateOn-Code-edge-pretrain",
    revision="4ca3a44b3093e72d48461aa6a67cfd5c0025c007",
    loader=MultiVectorModel,
    reference="https://huggingface.co/lightonai/LateOn-Code-edge-pretrain",
    release_date="2026-02-12",
    license="apache-2.0",
    open_weights=True,
    adapted_from="mixedbread-ai/mxbai-edge-colbert-v0-17m",
    superseded_by=None,
    # -- model characteristics --
    model_type=["late-interaction"],
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    use_instructions=False,
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    # -- size and limits --
    n_parameters=17_000_000,  # same value as int(17 * 1e6)
    n_embedding_parameters=12894720,
    memory_usage_mb=64,
    max_tokens=7999,
    embed_dim=48,
    # -- training provenance --
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/pre-training.py",
    public_training_data="https://huggingface.co/datasets/lightonai/cornstack",
    # A set, so member order carries no meaning; kept alphabetical for diffs.
    training_datasets={
        "AmazonQA",
        "CornStack",
        "HotpotQA",
        "LoTTE",
        "MSMARCO",
        "MultiLongDocRetrieval",
        "NQ",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
| 681 | + |
| 682 | + |
# Registry metadata for lightonai/LateOn-Code-edge, a late-interaction model
# loaded through MultiVectorModel and scored with MaxSim. It is declared as
# adapted from the lightonai/LateOn-Code-edge-pretrain checkpoint.
lightonai__late_on_code_edge = ModelMeta(
    # -- identity and loading --
    name="lightonai/LateOn-Code-edge",
    revision="07ef20f406c86badca122464808f4cac2f6e4b25",
    loader=MultiVectorModel,
    reference="https://huggingface.co/lightonai/LateOn-Code-edge",
    release_date="2026-02-12",
    license="apache-2.0",
    open_weights=True,
    adapted_from="lightonai/LateOn-Code-edge-pretrain",
    superseded_by=None,
    # -- model characteristics --
    model_type=["late-interaction"],
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    use_instructions=False,
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    # -- size and limits --
    n_parameters=17_000_000,  # same value as int(17 * 1e6)
    n_embedding_parameters=12894720,
    memory_usage_mb=64,
    max_tokens=7999,
    embed_dim=48,
    # -- training provenance --
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/fine_tuning.py",
    public_training_data="https://huggingface.co/datasets/lightonai/nv-embed-supervised-distill-dedup-code",
    # A set, so member order carries no meaning; kept alphabetical for diffs.
    training_datasets={
        "AmazonQA",
        "AppsRetrieval",
        "COIRCodeSearchNetRetrieval",
        "CodeFeedbackMT",
        "CodeFeedbackST",
        "CodeSearchNetCCRetrieval",
        "CodeSearchNetRetrieval",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CornStack",
        "CosQA",
        "HotpotQA",
        "LoTTE",
        "MSMARCO",
        "MultiLongDocRetrieval",
        "NQ",
        "StackOverflowQA",
        "SyntheticText2SQL",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
0 commit comments