Skip to content

Commit 9823529

Browse files
fix: Added Misc Chinese models (#1819)
* Added moka and piccolo models to overview file
* Added Text2Vec models
* Added various Chinese embedding models
---------
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
1 parent 3b2d074 commit 9823529

File tree

3 files changed

+196
-0
lines changed

3 files changed

+196
-0
lines changed

mteb/models/misc_models.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1738,3 +1738,91 @@
17381738
training_datasets=None, # They don't specify
17391739
superseded_by=None,
17401740
)
1741+
# Metadata record for lier007/xiaobu-embedding; no loader is registered for it.
xiaobu_embedding = ModelMeta(
    # --- identity and provenance ---
    name="lier007/xiaobu-embedding",
    reference="https://huggingface.co/lier007/xiaobu-embedding",
    revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92",
    release_date="2024-01-09",
    adapted_from="thenlper/gte-large-zh",
    superseded_by=None,
    # --- loading and inference ---
    loader=None,
    framework=["PyTorch", "Sentence Transformers"],
    similarity_fn_name="cosine",
    use_instructions=None,
    languages=["zho_Hans"],
    # --- size and shape ---
    n_parameters=326 * 1e6,
    memory_usage=None,
    max_tokens=512,
    embed_dim=1024,
    # --- licensing and openness ---
    license="not specified",
    open_weights=True,
    public_training_code=None,
    public_training_data=False,
    # Fine-tuned from GTE; neither model discloses its training data.
    training_datasets=None,
)
1763+
# Metadata record for lier007/xiaobu-embedding-v2; no loader is registered for it.
xiaobu_embedding_v2 = ModelMeta(
    # --- identity and provenance ---
    name="lier007/xiaobu-embedding-v2",
    reference="https://huggingface.co/lier007/xiaobu-embedding-v2",
    revision="1912f2e59a5c2ef802a471d735a38702a5c9485e",
    release_date="2024-06-30",
    adapted_from="sensenova/piccolo-base-zh",
    superseded_by=None,
    # --- loading and inference ---
    loader=None,
    framework=["PyTorch", "Sentence Transformers"],
    similarity_fn_name="cosine",
    use_instructions=None,
    languages=["zho_Hans"],
    # --- size and shape ---
    n_parameters=326 * 1e6,
    memory_usage=None,
    max_tokens=512,
    embed_dim=768,
    # --- licensing and openness ---
    license="not specified",
    open_weights=True,
    public_training_code=None,
    public_training_data=False,
    # Fine-tuned from piccolo-embedding; neither model states its training data.
    training_datasets=None,
)
1785+
# Metadata record for Classical/Yinka; no loader is registered for it.
yinka_embedding = ModelMeta(
    # --- identity and provenance ---
    name="Classical/Yinka",
    reference="https://huggingface.co/Classical/Yinka",
    # NOTE(review): revision and release_date are identical to the
    # lier007/xiaobu-embedding entry in this file — possibly a copy-paste
    # leftover; verify both against the Classical/Yinka repository.
    revision="59c79d82eb5223cd9895f6eb8e825c7fa10e4e92",
    release_date="2024-01-09",
    adapted_from="dunzhang/stella-mrl-large-zh-v3.5-1792d",
    superseded_by=None,
    # --- loading and inference ---
    loader=None,
    framework=["PyTorch", "Sentence Transformers"],
    similarity_fn_name="cosine",
    use_instructions=None,
    languages=["zho_Hans"],
    # --- size and shape ---
    n_parameters=326 * 1e6,
    memory_usage=None,
    max_tokens=512,
    embed_dim=1024,
    # --- licensing and openness ---
    license="not specified",
    open_weights=True,
    public_training_code=None,
    public_training_data=False,
    # Training data is not disclosed.
    training_datasets=None,
)
1807+
# Metadata record for TencentBAC/Conan-embedding-v1; no loader is registered
# for it (loader=None).
conan_embedding = ModelMeta(
    name="TencentBAC/Conan-embedding-v1",
    revision="bb9749a57d4f02fd71722386f8d0f5a9398d7eeb",
    release_date="2024-08-22",
    languages=["zho_Hans"],
    loader=None,
    n_parameters=326 * 1e6,
    memory_usage=None,
    max_tokens=512,
    embed_dim=768,
    license="cc-by-nc-4.0",
    open_weights=True,
    public_training_data=False,
    public_training_code=None,
    framework=["PyTorch", "Sentence Transformers"],
    # The reference must point at this model's own page. It previously held
    # the Classical/Yinka URL, carried over from the entry above.
    reference="https://huggingface.co/TencentBAC/Conan-embedding-v1",
    similarity_fn_name="cosine",
    use_instructions=None,
    # source: https://arxiv.org/pdf/2408.15710
    # They "scraped" things from the internet, so the exact data is unknown
    # and benchmark leakage cannot be ruled out.
    training_datasets=None,
    superseded_by=None,
)

mteb/models/overview.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
nomic_models,
3737
nvidia_models,
3838
openai_models,
39+
piccolo_models,
3940
promptriever_models,
4041
repllama_models,
4142
rerankers_custom,
@@ -44,6 +45,7 @@
4445
salesforce_models,
4546
sentence_transformers_models,
4647
stella_models,
48+
text2vec_models,
4749
uae_models,
4850
voyage_models,
4951
)
@@ -69,11 +71,13 @@
6971
llm2vec_models,
7072
mxbai_models,
7173
model2vec_models,
74+
moka_models,
7275
misc_models,
7376
nomic_models,
7477
no_instruct_sentence_models,
7578
nvidia_models,
7679
openai_models,
80+
piccolo_models,
7781
promptriever_models,
7882
repllama_models,
7983
rerankers_custom,
@@ -88,6 +92,7 @@
8892
jina_models,
8993
jasper_models,
9094
uae_models,
95+
text2vec_models,
9196
stella_models,
9297
uae_models,
9398
voyage_models,

mteb/models/text2vec_models.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""Implementation of Text2Vec models"""
2+
3+
from __future__ import annotations
4+
5+
from mteb.model_meta import ModelMeta
6+
7+
# I couldn't find the large model on HF for some reason
8+
# Metadata record for shibing624/text2vec-base-chinese.
text2vec_base_chinese = ModelMeta(
    # --- identity and provenance ---
    name="shibing624/text2vec-base-chinese",
    reference="https://huggingface.co/shibing624/text2vec-base-chinese",
    revision="183bb99aa7af74355fb58d16edf8c13ae7c5433e",
    release_date="2022-01-23",
    adapted_from=None,
    superseded_by=None,
    # --- inference ---
    framework=["Sentence Transformers", "PyTorch"],
    similarity_fn_name="cosine",
    use_instructions=False,
    # NOTE(review): this tag uses a hyphen, whereas the entries in
    # misc_models.py use an underscore ("zho_Hans") — confirm which form
    # ModelMeta expects.
    languages=["zho-Hans"],
    # --- size and shape ---
    n_parameters=102 * 1e6,
    memory_usage=None,
    max_tokens=512,
    embed_dim=768,
    # --- licensing and openness ---
    license="apache-2.0",
    open_weights=True,
    public_training_code=False,  # Couldn't find it
    public_training_data=True,
    # source: https://huggingface.co/shibing624/text2vec-base-chinese
    # Trained on data that is not in MTEB (overlaps the author is not aware
    # of are still possible):
    #   - shibing624/nli-zh-all/text2vec-base-chinese-sentence-dataset
    training_datasets={},
)
34+
35+
# Metadata record for shibing624/text2vec-base-chinese-paraphrase.
text2vec_base_chinese_paraphrase = ModelMeta(
    # --- identity and provenance ---
    name="shibing624/text2vec-base-chinese-paraphrase",
    reference="https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase",
    revision="e90c150a9c7fb55a67712a766d6820c55fb83cdd",
    release_date="2023-06-19",
    adapted_from=None,
    superseded_by=None,
    # --- inference ---
    framework=["Sentence Transformers", "PyTorch"],
    similarity_fn_name="cosine",
    use_instructions=False,
    languages=["zho-Hans"],
    # --- size and shape ---
    n_parameters=118 * 1e6,
    memory_usage=None,
    max_tokens=512,
    embed_dim=768,
    # --- licensing and openness ---
    license="apache-2.0",
    open_weights=True,
    public_training_code=False,  # Couldn't find it
    public_training_data=True,
    # source: https://huggingface.co/shibing624/text2vec-base-chinese
    # Trained on data that is not in MTEB (overlaps the author is not aware
    # of are still possible):
    #   - shibing624/nli-zh-all/text2vec-base-chinese-paraphrase
    training_datasets={},
)
61+
62+
63+
# Language-script tags passed as `languages` for text2vec-base-multilingual.
# In order: German (de), English (en), Spanish (es), French (fr), Italian (it),
# Dutch (nl), Polish (pl), Portuguese (pt), Russian (ru),
# Chinese (Simplified, zh).
# NOTE(review): these tags use a hyphen, whereas the entries in misc_models.py
# use an underscore ("zho_Hans") — confirm which form ModelMeta expects.
text2vec_multi_langs = [
    "deu-Latn",
    "eng-Latn",
    "spa-Latn",
    "fra-Latn",
    "ita-Latn",
    "nld-Latn",
    "pol-Latn",
    "por-Latn",
    "rus-Cyrl",
    "zho-Hans",
]
75+
# Metadata record for shibing624/text2vec-base-multilingual; its languages
# come from the module-level `text2vec_multi_langs` list.
text2vec_base_multilingual = ModelMeta(
    name="shibing624/text2vec-base-multilingual",
    languages=text2vec_multi_langs,
    open_weights=True,
    revision="6633dc49e554de7105458f8f2e96445c6598e9d1",
    release_date="2023-06-22",
    # While it can be loaded with SBERT, it has one suspicious file according
    # to huggingface, so it is probably best not to register a loader.
    loader=None,
    n_parameters=118 * 1e6,
    memory_usage=None,
    embed_dim=384,
    license="apache-2.0",
    max_tokens=256,
    # The reference must point at this model's own page. It previously held
    # the text2vec-base-chinese-paraphrase URL, carried over from the entry
    # above.
    reference="https://huggingface.co/shibing624/text2vec-base-multilingual",
    similarity_fn_name="cosine",
    framework=["Sentence Transformers", "PyTorch"],
    use_instructions=False,
    superseded_by=None,
    adapted_from="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    public_training_code=False,  # Couldn't find it
    public_training_data=True,
    training_datasets={
        # source: https://huggingface.co/shibing624/text2vec-base-chinese
        # Not in MTEB
        # - shibing624/nli-zh-all/tree/main/text2vec-base-multilingual-dataset
        # (Could have overlaps I'm not aware of)
    },
)

0 commit comments

Comments
 (0)