Skip to content

Commit 64459d1

Browse files
authored
fix: add jasper token compression model (#3557)
* add jasper token compression model * add distillation dataset
1 parent 85c5ec3 commit 64459d1

File tree

1 file changed

+53
-2
lines changed

1 file changed

+53
-2
lines changed

mteb/models/model_implementations/jasper_models.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,34 @@
77

88
from mteb.abstasks.task_metadata import TaskMetadata
99
from mteb.models.abs_encoder import AbsEncoder
10+
from mteb.models.instruct_wrapper import InstructSentenceTransformerModel
11+
from mteb.models.model_implementations.bge_models import (
12+
bge_chinese_training_data,
13+
bge_full_data,
14+
bge_m3_training_data,
15+
)
16+
from mteb.models.model_implementations.e5_instruct import E5_MISTRAL_TRAINING_DATA
17+
from mteb.models.model_implementations.nvidia_models import nvidia_training_datasets
18+
from mteb.models.model_implementations.qzhou_models import qzhou_training_data
1019
from mteb.models.model_meta import ModelMeta, ScoringFunction
1120
from mteb.types import Array, BatchedInput, PromptType
1221

13-
from .nvidia_models import nvidia_training_datasets
14-
1522
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
1623

1724

25+
def instruction_template(
    instruction: str, prompt_type: PromptType | None = None
) -> str:
    """Format a task instruction as the prefix placed in front of a query.

    Args:
        instruction: The instruction text. Although annotated as ``str``,
            the body also accepts a mapping indexed by ``prompt_type`` and
            treats any falsy value (empty string, ``None``, empty mapping)
            as "no instruction".
        prompt_type: Which side of the pair is being encoded, if known.

    Returns:
        An empty string when there is no instruction or when encoding a
        document; otherwise ``"Instruct: <instruction>"`` followed by a
        newline and ``"Query:"``.

    Raises:
        KeyError: If ``instruction`` is a mapping that has no entry for a
            non-``None`` ``prompt_type``.
    """
    # Documents, and inputs without any instruction, get no prefix at all.
    skip_prefix = not instruction or prompt_type == PromptType.document
    if skip_prefix:
        return ""

    text = instruction
    if isinstance(text, dict):
        # A mapping needs a prompt type to choose an entry; without one,
        # fall back to a generic web-search retrieval instruction.
        text = (
            "Given a web search query, retrieve relevant passages that answer the query"
            if prompt_type is None
            else text[prompt_type]
        )
    return f"Instruct: {text}\nQuery:"
36+
37+
1838
class JasperModel(AbsEncoder):
1939
def __init__(
2040
self,
@@ -114,3 +134,34 @@ def encode(
114134
}
115135
""",
116136
)
137+
138+
# Registry entry for infgrad/Jasper-Token-Compression-600M, loaded through
# the instruction-aware Sentence Transformers wrapper.
Jasper_Token_Compression_600M = ModelMeta(
    # --- loading ---------------------------------------------------------
    loader=InstructSentenceTransformerModel,
    loader_kwargs={
        "instruction_template": instruction_template,
        "apply_instruction_to_passages": False,
        "trust_remote_code": True,
    },
    # --- identity --------------------------------------------------------
    name="infgrad/Jasper-Token-Compression-600M",
    languages=["eng-Latn", "zho-Hans"],
    open_weights=True,
    revision="06a100f753a5a96d9e583b3af79c6fcdfacc4719",
    release_date="2025-11-14",
    # --- size and architecture -------------------------------------------
    n_parameters=595776512,
    memory_usage_mb=2272,
    embed_dim=2048,
    license="mit",
    max_tokens=32768,
    reference="https://huggingface.co/infgrad/Jasper-Token-Compression-600M",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=True,
    # --- training provenance ---------------------------------------------
    public_training_code=None,
    # The public dataset below is the unsupervised data used for distillation.
    public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset",
    # Union of the training-data collections imported from the BGE,
    # E5-Mistral and QZhou model modules.
    training_datasets=(
        bge_m3_training_data
        | bge_chinese_training_data
        | bge_full_data
        | E5_MISTRAL_TRAINING_DATA
        | qzhou_training_data
    ),
)

0 commit comments

Comments
 (0)