@@ -7,14 +7,34 @@
 
 from mteb.abstasks.task_metadata import TaskMetadata
 from mteb.models.abs_encoder import AbsEncoder
+from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
+from mteb.models.model_implementations.bge_models import (
+    bge_chinese_training_data,
+    bge_full_data,
+    bge_m3_training_data,
+)
+from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
+from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
+from mteb.models.model_implementations.qzhou_models import qzhou_training_data
 from mteb.models.model_meta import ModelMeta, ScoringFunction
 from mteb.types import Array, BatchedInput, PromptType
 
-from .nvidia_models import nvidia_training_datasets
-
 logger = logging.getLogger(__name__)
 
 
+def instruction_template(
+    instruction: str | dict[PromptType, str], prompt_type: PromptType | None = None
+) -> str:
+    if not instruction or prompt_type == PromptType.document:
+        return ""
+    if isinstance(instruction, dict):
+        if prompt_type is None:
+            instruction = "Given a web search query, retrieve relevant passages that answer the query"
+        else:
+            instruction = instruction[prompt_type]
+    return f"Instruct: {instruction}\nQuery:"
+
+
 class JasperModel(AbsEncoder):
     def __init__(
         self,
@@ -114,3 +134,34 @@ def encode(
 }
 """,
 )
+
+Jasper_Token_Compression_600M = ModelMeta(
+    loader=InstructSentenceTransformerModel,
+    loader_kwargs=dict(
+        instruction_template=instruction_template,
+        apply_instruction_to_passages=False,
+        trust_remote_code=True,
+    ),
+    name="infgrad/Jasper-Token-Compression-600M",
+    languages=["eng-Latn", "zho-Hans"],
+    open_weights=True,
+    revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
+    release_date="2025-11-14",
+    n_parameters=595776512,
+    memory_usage_mb=2272,
+    embed_dim=2048,
+    license="mit",
+    max_tokens=32768,
+    reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    # public_training_data: unsupervised data for distillation
+    public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
+    training_datasets=bge_m3_training_data
+    | bge_chinese_training_data
+    | bge_full_data
+    | E5_MISTRAL_TRAINING_DATA
+    | qzhou_training_data,
+)
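
For context, a minimal usage sketch of the model registered above. It assumes a recent mteb release where `mteb.get_model` resolves this `ModelMeta` by name and the classic `MTEB(...).run` entry point is available; the task choice is purely illustrative.

```python
import mteb

# Resolve the ModelMeta registered above; this instantiates the
# InstructSentenceTransformerModel loader with instruction_template attached.
model = mteb.get_model("infgrad/Jasper-Token-Compression-600M")

# Per instruction_template, queries are prefixed with
# "Instruct: <task instruction>\nQuery:" while documents get no prefix,
# since apply_instruction_to_passages=False in loader_kwargs.
tasks = mteb.get_tasks(tasks=["NFCorpus"])  # illustrative task choice
results = mteb.MTEB(tasks=tasks).run(model, output_folder="results")
```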