Skip to content

Commit 4345e63

Browse files
NohTow and Samoed authored
model: LateOn-Code models definition (#4175)
* First draft of LateOn code models definition * Fix reference for LateOn-Code * Fix reference LateOn code edge pretrain * Add memory_usage_mb (and embed_dim) * fix lint * Add training datasets --------- Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
1 parent 3ab29f4 commit 4345e63

File tree

2 files changed

+248
-15
lines changed

2 files changed

+248
-15
lines changed

mteb/models/model_implementations/mixedbread_ai_models.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
from mteb.models.model_implementations.pylate_models import MultiVectorModel
2-
from mteb.models.model_meta import (
3-
ModelMeta,
4-
ScoringFunction,
5-
)
2+
from mteb.models.model_meta import ModelMeta, ScoringFunction
63
from mteb.models.sentence_transformer_wrapper import (
74
CrossEncoderWrapper,
85
sentence_transformers_loader,

mteb/models/model_implementations/pylate_models.py

Lines changed: 247 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@
77
from pathlib import Path
88
from typing import TYPE_CHECKING, Any
99

10-
from mteb._create_dataloaders import (
11-
create_dataloader,
12-
)
10+
from mteb._create_dataloaders import create_dataloader
1311
from mteb._requires_package import requires_package
1412
from mteb.models.abs_encoder import AbsEncoder, get_prompt
1513
from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -487,17 +485,255 @@ def _encode(
487485
use_instructions=False,
488486
adapted_from="Alibaba-NLP/gte-modernbert-base",
489487
superseded_by=None,
488+
training_datasets={
489+
"MSMARCO",
490+
"MSMARCOHardNegatives",
491+
"NanoMSMARCORetrieval",
492+
"NQ",
493+
"NQHardNegatives",
494+
"NanoNQRetrieval",
495+
"HotpotQA",
496+
"HotpotQAHardNegatives",
497+
"CodeSearchNet",
498+
"FEVER",
499+
"DBPedia",
500+
"DBPediaHardNegatives.v2",
501+
"NanoDBPediaRetrieval",
502+
"TRECDL2019",
503+
"TRECDL2020",
504+
"CornStack",
505+
},
506+
citation="""@misc{GTE-ModernColBERT,
507+
title={GTE-ModernColBERT},
508+
author={Chaffin, Antoine},
509+
url={https://huggingface.co/lightonai/GTE-ModernColBERT-v1},
510+
year={2025}
511+
}""",
512+
)
513+
514+
# LateOn-Code-pretrain: 149M-parameter late-interaction (MaxSim/ColBERT-style)
# code retriever, pre-trained on CornStack plus general retrieval data.
# NOTE(review): release_date 2026-02-12 and citation year 2026 are taken
# verbatim from the upstream commit — confirm they are intentional.
lightonai__late_on_code_pretrain = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code-pretrain",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="71251a6ee61eee488de7e3ae29f5fb4c3c94699b",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/pre-training.py",
    public_training_data="https://huggingface.co/datasets/lightonai/cornstack",
    release_date="2026-02-12",
    n_parameters=int(149 * 1e6),
    n_embedding_parameters=38684160,
    memory_usage_mb=568,
    max_tokens=8192,
    embed_dim=128,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code-pretrain",
    use_instructions=False,
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "MSMARCOHardNegatives",
        "NanoMSMARCORetrieval",
        "NQ",
        "NQHardNegatives",
        "NanoNQRetrieval",
        "HotpotQA",
        "HotpotQAHardNegatives",
        "CodeSearchNet",
        "FEVER",
        "DBPedia",
        "DBPediaHardNegatives.v2",
        "NanoDBPediaRetrieval",
        "TRECDL2019",
        "TRECDL2020",
        "CornStack",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
568+
569+
570+
# LateOn-Code: 149M-parameter late-interaction code retriever, fine-tuned from
# the LateOn-Code-pretrain checkpoint on supervised code-retrieval data.
# Fix: the original training_datasets set literal listed every code dataset
# twice ("AppsRetrieval" … "COIRCodeSearchNetRetrieval"); duplicates in a set
# literal are silently collapsed at runtime, so each entry now appears once.
# NOTE(review): release_date 2026-02-12 and citation year 2026 are taken
# verbatim from the upstream commit — confirm they are intentional.
lightonai__late_on_code = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="734b659a57935ef50562d79581c3ff1f8d825c93",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/fine_tuning.py",
    public_training_data="https://huggingface.co/datasets/lightonai/nv-embed-supervised-distill-dedup-code",
    release_date="2026-02-12",
    n_parameters=int(149 * 1e6),
    n_embedding_parameters=38684160,
    memory_usage_mb=568,
    max_tokens=8192,
    embed_dim=128,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code",
    use_instructions=False,
    adapted_from="lightonai/LateOn-Code-pretrain",
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "mMARCO-NL",
        "CornStack",
        "AppsRetrieval",
        "SyntheticText2SQL",
        "CosQA",
        "CodeFeedbackMT",
        "CodeFeedbackST",
        "StackOverflowQA",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CodeSearchNetRetrieval",
        "CodeSearchNetCCRetrieval",
        "COIRCodeSearchNetRetrieval",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
634+
635+
# LateOn-Code-edge-pretrain: compact 17M-parameter late-interaction code
# retriever, pre-trained from mxbai-edge-colbert-v0-17m on CornStack plus
# general retrieval data.
# NOTE(review): release_date 2026-02-12, citation year 2026 and
# max_tokens=7999 are taken verbatim from the upstream commit — confirm.
lightonai__late_on_code_edge_pretrain = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code-edge-pretrain",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="4ca3a44b3093e72d48461aa6a67cfd5c0025c007",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/pre-training.py",
    public_training_data="https://huggingface.co/datasets/lightonai/cornstack",
    release_date="2026-02-12",
    n_parameters=int(17 * 1e6),
    n_embedding_parameters=12894720,
    memory_usage_mb=64,
    max_tokens=7999,
    embed_dim=48,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code-edge-pretrain",
    use_instructions=False,
    adapted_from="mixedbread-ai/mxbai-edge-colbert-v0-17m",
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "NQ",
        "HotpotQA",
        "AmazonQA",
        "LoTTE",
        "MultiLongDocRetrieval",
        "CornStack",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
681+
682+
683+
# LateOn-Code-edge: compact 17M-parameter late-interaction code retriever,
# fine-tuned from the LateOn-Code-edge-pretrain checkpoint on supervised
# code-retrieval data.
# NOTE(review): release_date 2026-02-12, citation year 2026 and
# max_tokens=7999 are taken verbatim from the upstream commit — confirm.
lightonai__late_on_code_edge = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code-edge",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="07ef20f406c86badca122464808f4cac2f6e4b25",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/fine_tuning.py",
    public_training_data="https://huggingface.co/datasets/lightonai/nv-embed-supervised-distill-dedup-code",
    release_date="2026-02-12",
    n_parameters=int(17 * 1e6),
    n_embedding_parameters=12894720,
    memory_usage_mb=64,
    max_tokens=7999,
    embed_dim=48,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code-edge",
    use_instructions=False,
    adapted_from="lightonai/LateOn-Code-edge-pretrain",
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "NQ",
        "HotpotQA",
        "AmazonQA",
        "LoTTE",
        "MultiLongDocRetrieval",
        "CornStack",
        "AppsRetrieval",
        "SyntheticText2SQL",
        "CosQA",
        "CodeFeedbackMT",
        "CodeFeedbackST",
        "StackOverflowQA",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CodeSearchNetRetrieval",
        "CodeSearchNetCCRetrieval",
        "COIRCodeSearchNetRetrieval",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)

0 commit comments

Comments (0)