Skip to content

Commit c76e40c

Browse files
committed
Merge branch 'main' into add_active_parameter
# Conflicts:
#	mteb/models/model_implementations/colqwen_models.py
2 parents 7896319 + b968433 commit c76e40c

File tree

4 files changed

+43
-51
lines changed

4 files changed

+43
-51
lines changed

mteb/abstasks/abstask.py

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,38 @@
1+
from __future__ import annotations
2+
13
import json
24
import logging
35
import warnings
46
from abc import ABC, abstractmethod
5-
from collections.abc import Mapping, Sequence
7+
from collections.abc import Sequence
68
from copy import copy
79
from pathlib import Path
8-
from typing import Any, Literal, cast
10+
from typing import TYPE_CHECKING, Any, Literal, cast
911

1012
import numpy as np
1113
from datasets import ClassLabel, Dataset, DatasetDict, load_dataset
1214
from sklearn.preprocessing import MultiLabelBinarizer
1315
from tqdm.auto import tqdm
14-
from typing_extensions import Self
1516

1617
from mteb._set_seed import _set_seed
17-
from mteb.abstasks.task_metadata import TaskMetadata
1818
from mteb.languages import LanguageScripts
1919
from mteb.models import (
2020
CrossEncoderProtocol,
2121
EncoderProtocol,
22-
MTEBModels,
2322
SearchProtocol,
2423
)
25-
from mteb.types import HFSubset, Modalities, ScoresDict
26-
from mteb.types._encoder_io import EncodeKwargs
27-
from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics
24+
25+
if TYPE_CHECKING:
26+
from collections.abc import Mapping
27+
28+
from typing_extensions import Self
29+
30+
from mteb.abstasks.task_metadata import TaskMetadata
31+
from mteb.models import (
32+
MTEBModels,
33+
)
34+
from mteb.types import EncodeKwargs, HFSubset, Modalities, ScoresDict
35+
from mteb.types.statistics import DescriptiveStatistics, SplitDescriptiveStatistics
2836

2937
logger = logging.getLogger(__name__)
3038

@@ -163,7 +171,7 @@ def evaluate(
163171
if not self.data_loaded:
164172
self.load_data()
165173

166-
self.dataset = cast(dict[HFSubset, DatasetDict], self.dataset)
174+
self.dataset = cast("dict[HFSubset, DatasetDict]", self.dataset)
167175

168176
scores = {}
169177
if self.hf_subsets is None:

mteb/models/model_implementations/colpali_models.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,21 @@
44
from typing import TYPE_CHECKING, Any
55

66
import torch
7-
from torch.utils.data import DataLoader
87
from tqdm.auto import tqdm
98

109
from mteb._requires_package import (
1110
requires_image_dependencies,
1211
requires_package,
1312
)
14-
from mteb.abstasks.task_metadata import TaskMetadata
1513
from mteb.models.abs_encoder import AbsEncoder
1614
from mteb.models.model_meta import ModelMeta, ScoringFunction
17-
from mteb.types import Array, BatchedInput, PromptType
1815

1916
if TYPE_CHECKING:
2017
from PIL import Image
18+
from torch.utils.data import DataLoader
19+
20+
from mteb.abstasks.task_metadata import TaskMetadata
21+
from mteb.types import Array, BatchedInput, PromptType
2122

2223
logger = logging.getLogger(__name__)
2324

mteb/models/model_implementations/colqwen_models.py

Lines changed: 10 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,23 @@
1+
from __future__ import annotations
2+
13
import logging
2-
from typing import Any
4+
from typing import TYPE_CHECKING, Any
35

46
import torch
5-
from torch.utils.data import DataLoader
67
from tqdm.auto import tqdm
78

89
from mteb._requires_package import (
910
requires_image_dependencies,
1011
requires_package,
1112
)
12-
from mteb.abstasks.task_metadata import TaskMetadata
1313
from mteb.models.abs_encoder import AbsEncoder
1414
from mteb.models.model_meta import ModelMeta, ScoringFunction
15-
from mteb.types import Array, BatchedInput, PromptType
15+
16+
if TYPE_CHECKING:
17+
from torch.utils.data import DataLoader
18+
19+
from mteb.abstasks.task_metadata import TaskMetadata
20+
from mteb.types import Array, BatchedInput, PromptType
1621

1722
from .colpali_models import (
1823
COLPALI_CITATION,
@@ -333,33 +338,6 @@ def similarity(self, a, b):
333338
citation=TOMORO_CITATION,
334339
)
335340

336-
colnomic_7b = ModelMeta(
337-
loader=ColQwen2_5Wrapper,
338-
loader_kwargs=dict(
339-
torch_dtype=torch.float16,
340-
),
341-
name="nomic-ai/colnomic-embed-multimodal-7b",
342-
model_type=["late-interaction"],
343-
languages=["eng-Latn"],
344-
revision="530094e83a40ca4edcb5c9e5ddfa61a4b5ea0d2f",
345-
release_date="2025-03-31",
346-
modalities=["image", "text"],
347-
n_parameters=7_000_000_000,
348-
n_embedding_parameters=None,
349-
memory_usage_mb=14400,
350-
max_tokens=128000,
351-
embed_dim=128,
352-
license="apache-2.0",
353-
open_weights=True,
354-
public_training_code="https://github.com/nomic-ai/colpali",
355-
public_training_data="https://huggingface.co/datasets/vidore/colpali_train_set",
356-
framework=["ColPali", "safetensors"],
357-
reference="https://huggingface.co/nomic-ai/colnomic-embed-multimodal-7b",
358-
similarity_fn_name="MaxSim",
359-
use_instructions=True,
360-
training_datasets=COLPALI_TRAINING_DATA,
361-
citation=COLPALI_CITATION,
362-
)
363341

364342
COLNOMIC_CITATION = """
365343
@misc{nomicembedmultimodal2025,
@@ -408,7 +386,7 @@ def similarity(self, a, b):
408386
)
409387

410388
colnomic_7b = ModelMeta(
411-
loader=ColQwen2Wrapper,
389+
loader=ColQwen2_5Wrapper,
412390
loader_kwargs=dict(
413391
torch_dtype=torch.float16,
414392
),

mteb/models/model_meta.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import json
44
import logging
55
import warnings
6-
from collections.abc import Callable, Sequence
6+
from collections.abc import Callable
77
from dataclasses import field
88
from enum import Enum
99
from functools import partial
@@ -12,9 +12,7 @@
1212

1313
import numpy as np
1414
from huggingface_hub import (
15-
GitCommitInfo,
1615
ModelCard,
17-
ModelCardData,
1816
get_safetensors_metadata,
1917
hf_hub_download,
2018
list_repo_commits,
@@ -33,17 +31,24 @@
3331
from sentence_transformers.models import Transformer
3432
from torch import nn
3533
from transformers import AutoConfig
36-
from typing_extensions import Self
3734

3835
from mteb._helpful_enum import HelpfulStrEnum
3936
from mteb.languages import check_language_code
40-
from mteb.models.models_protocols import EncoderProtocol, MTEBModels
37+
from mteb.models.models_protocols import MTEBModels
4138
from mteb.types import ISOLanguageScript, Licenses, Modalities, StrDate, StrURL
4239

4340
if TYPE_CHECKING:
41+
from collections.abc import Sequence
42+
43+
from huggingface_hub import (
44+
GitCommitInfo,
45+
ModelCardData,
46+
)
4447
from sentence_transformers import CrossEncoder, SentenceTransformer
48+
from typing_extensions import Self
4549

4650
from mteb.abstasks import AbsTask
51+
from mteb.models.models_protocols import EncoderProtocol
4752

4853

4954
logger = logging.getLogger(__name__)
@@ -512,7 +517,7 @@ def is_zero_shot_on(self, tasks: Sequence[AbsTask] | Sequence[str]) -> bool | No
512517
if isinstance(tasks[0], str):
513518
benchmark_datasets = set(tasks)
514519
else:
515-
tasks = cast(Sequence["AbsTask"], tasks)
520+
tasks = cast("Sequence[AbsTask]", tasks)
516521
benchmark_datasets = set()
517522
for task in tasks:
518523
benchmark_datasets.add(task.metadata.name)
@@ -567,7 +572,7 @@ def zero_shot_percentage(
567572
if isinstance(tasks[0], str):
568573
benchmark_datasets = set(tasks)
569574
else:
570-
tasks = cast(Sequence["AbsTask"], tasks)
575+
tasks = cast("Sequence[AbsTask]", tasks)
571576
benchmark_datasets = {task.metadata.name for task in tasks}
572577
overlap = training_datasets & benchmark_datasets
573578
perc_overlap = 100 * (len(overlap) / len(benchmark_datasets))

0 commit comments

Comments
 (0)