Skip to content

Commit 4345e63

Browse files
NohTow and Samoed authored
model: LateOn-Code models definition (#4175)
* First draft of LateOn code models definition * Fix reference for LateOn-Code * Fix reference LateOn code edge pretrain * Add memory_usage_mb (and embed_dim) * fix lint * Add training datasets --------- Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
1 parent 3ab29f4 commit 4345e63

File tree

2 files changed

+248
-15
lines changed

2 files changed

+248
-15
lines changed

mteb/models/model_implementations/mixedbread_ai_models.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
from mteb.models.model_implementations.pylate_models import MultiVectorModel
2-
from mteb.models.model_meta import (
3-
ModelMeta,
4-
ScoringFunction,
5-
)
2+
from mteb.models.model_meta import ModelMeta, ScoringFunction
63
from mteb.models.sentence_transformer_wrapper import (
74
CrossEncoderWrapper,
85
sentence_transformers_loader,

mteb/models/model_implementations/pylate_models.py

Lines changed: 247 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@
77
from pathlib import Path
88
from typing import TYPE_CHECKING, Any
99

10-
from mteb._create_dataloaders import (
11-
create_dataloader,
12-
)
10+
from mteb._create_dataloaders import create_dataloader
1311
from mteb._requires_package import requires_package
1412
from mteb.models.abs_encoder import AbsEncoder, get_prompt
1513
from mteb.models.model_meta import ModelMeta, ScoringFunction
@@ -487,17 +485,255 @@ def _encode(
487485
use_instructions=False,
488486
adapted_from="Alibaba-NLP/gte-modernbert-base",
489487
superseded_by=None,
488+
training_datasets={
489+
"MSMARCO",
490+
"MSMARCOHardNegatives",
491+
"NanoMSMARCORetrieval",
492+
"NQ",
493+
"NQHardNegatives",
494+
"NanoNQRetrieval",
495+
"HotpotQA",
496+
"HotpotQAHardNegatives",
497+
"CodeSearchNet",
498+
"FEVER",
499+
"DBPedia",
500+
"DBPediaHardNegatives.v2",
501+
"NanoDBPediaRetrieval",
502+
"TRECDL2019",
503+
"TRECDL2020",
504+
"CornStack",
505+
},
506+
citation="""@misc{GTE-ModernColBERT,
507+
title={GTE-ModernColBERT},
508+
author={Chaffin, Antoine},
509+
url={https://huggingface.co/lightonai/GTE-ModernColBERT-v1},
510+
year={2025}
511+
}""",
512+
)
513+
514+
# LateOn-Code-pretrain: 149M-parameter late-interaction (MaxSim/ColBERT-style)
# code retriever, pre-trained on CornStack plus general retrieval data.
# NOTE(review): release_date 2026-02-12 and citation year 2026 are taken
# verbatim from the upstream commit — confirm they are intentional.
lightonai__late_on_code_pretrain = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code-pretrain",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="71251a6ee61eee488de7e3ae29f5fb4c3c94699b",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/pre-training.py",
    public_training_data="https://huggingface.co/datasets/lightonai/cornstack",
    release_date="2026-02-12",
    n_parameters=int(149 * 1e6),
    n_embedding_parameters=38684160,
    memory_usage_mb=568,
    max_tokens=8192,
    embed_dim=128,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code-pretrain",
    use_instructions=False,
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "MSMARCOHardNegatives",
        "NanoMSMARCORetrieval",
        "NQ",
        "NQHardNegatives",
        "NanoNQRetrieval",
        "HotpotQA",
        "HotpotQAHardNegatives",
        "CodeSearchNet",
        "FEVER",
        "DBPedia",
        "DBPediaHardNegatives.v2",
        "NanoDBPediaRetrieval",
        "TRECDL2019",
        "TRECDL2020",
        "CornStack",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
568+
569+
570+
# LateOn-Code: 149M-parameter late-interaction code retriever, fine-tuned from
# the LateOn-Code-pretrain checkpoint on supervised code-retrieval data.
# Fix: the original training_datasets set literal listed every code dataset
# twice ("AppsRetrieval" … "COIRCodeSearchNetRetrieval"); duplicates in a set
# literal are silently collapsed at runtime, so each entry now appears once.
# NOTE(review): release_date 2026-02-12 and citation year 2026 are taken
# verbatim from the upstream commit — confirm they are intentional.
lightonai__late_on_code = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="734b659a57935ef50562d79581c3ff1f8d825c93",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/fine_tuning.py",
    public_training_data="https://huggingface.co/datasets/lightonai/nv-embed-supervised-distill-dedup-code",
    release_date="2026-02-12",
    n_parameters=int(149 * 1e6),
    n_embedding_parameters=38684160,
    memory_usage_mb=568,
    max_tokens=8192,
    embed_dim=128,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code",
    use_instructions=False,
    adapted_from="lightonai/LateOn-Code-pretrain",
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "mMARCO-NL",
        "CornStack",
        "AppsRetrieval",
        "SyntheticText2SQL",
        "CosQA",
        "CodeFeedbackMT",
        "CodeFeedbackST",
        "StackOverflowQA",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CodeSearchNetRetrieval",
        "CodeSearchNetCCRetrieval",
        "COIRCodeSearchNetRetrieval",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
634+
635+
# LateOn-Code-edge-pretrain: compact 17M-parameter late-interaction code
# retriever, pre-trained from mxbai-edge-colbert-v0-17m on CornStack plus
# general retrieval data.
# NOTE(review): release_date 2026-02-12, citation year 2026 and
# max_tokens=7999 are taken verbatim from the upstream commit — confirm.
lightonai__late_on_code_edge_pretrain = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code-edge-pretrain",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="4ca3a44b3093e72d48461aa6a67cfd5c0025c007",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/pre-training.py",
    public_training_data="https://huggingface.co/datasets/lightonai/cornstack",
    release_date="2026-02-12",
    n_parameters=int(17 * 1e6),
    n_embedding_parameters=12894720,
    memory_usage_mb=64,
    max_tokens=7999,
    embed_dim=48,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code-edge-pretrain",
    use_instructions=False,
    adapted_from="mixedbread-ai/mxbai-edge-colbert-v0-17m",
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "NQ",
        "HotpotQA",
        "AmazonQA",
        "LoTTE",
        "MultiLongDocRetrieval",
        "CornStack",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)
681+
682+
683+
# LateOn-Code-edge: compact 17M-parameter late-interaction code retriever,
# fine-tuned from the LateOn-Code-edge-pretrain checkpoint on supervised
# code-retrieval data.
# NOTE(review): release_date 2026-02-12, citation year 2026 and
# max_tokens=7999 are taken verbatim from the upstream commit — confirm.
lightonai__late_on_code_edge = ModelMeta(
    loader=MultiVectorModel,
    name="lightonai/LateOn-Code-edge",
    model_type=["late-interaction"],
    # English prose plus the six programming languages covered by the model.
    languages=[
        "eng-Latn",
        "python-Code",
        "go-Code",
        "java-Code",
        "javascript-Code",
        "ruby-Code",
        "php-Code",
    ],
    open_weights=True,
    revision="07ef20f406c86badca122464808f4cac2f6e4b25",
    public_training_code="https://github.com/lightonai/pylate/blob/main/examples/train/lateon_code/fine_tuning.py",
    public_training_data="https://huggingface.co/datasets/lightonai/nv-embed-supervised-distill-dedup-code",
    release_date="2026-02-12",
    n_parameters=int(17 * 1e6),
    n_embedding_parameters=12894720,
    memory_usage_mb=64,
    max_tokens=7999,
    embed_dim=48,  # per-token multi-vector dimensionality
    license="apache-2.0",
    similarity_fn_name="MaxSim",
    framework=["PyLate", "ColBERT", "safetensors", "Sentence Transformers"],
    reference="https://huggingface.co/lightonai/LateOn-Code-edge",
    use_instructions=False,
    adapted_from="lightonai/LateOn-Code-edge-pretrain",
    superseded_by=None,
    training_datasets={
        "MSMARCO",
        "NQ",
        "HotpotQA",
        "AmazonQA",
        "LoTTE",
        "MultiLongDocRetrieval",
        "CornStack",
        "AppsRetrieval",
        "SyntheticText2SQL",
        "CosQA",
        "CodeFeedbackMT",
        "CodeFeedbackST",
        "StackOverflowQA",
        "CodeTransOceanContest",
        "CodeTransOceanDL",
        "CodeSearchNetRetrieval",
        "CodeSearchNetCCRetrieval",
        "COIRCodeSearchNetRetrieval",
    },
    citation="""@misc{LateOn-Code,
  title = {LateOn-Code: a Family of State-Of-The-Art Late Interaction Code Retrieval Models},
  author = {Chaffin, Antoine},
  url = {https://huggingface.co/collections/lightonai/lateon-code},
  year = {2026}
}""",
)

0 commit comments

Comments (0)