
Commit b608081

wip
1 parent 75bdf4f commit b608081

6 files changed: +69 -35 lines changed


model2vec/inference/model.py

Lines changed: 15 additions & 9 deletions
@@ -3,7 +3,7 @@
 import re
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Sequence, TypeVar
+from typing import Sequence, TypeVar, cast

 import huggingface_hub
 import numpy as np
@@ -273,14 +273,14 @@ def save_pipeline(pipeline: StaticModelPipeline, folder_path: str | Path) -> None:
     )


-def _is_multi_label_shaped(y: LabelType) -> bool:
+def _is_multi_label_shaped(y: list[int] | list[str] | list[list[int]] | list[list[str]]) -> bool:
     """Check if the labels are in a multi-label shape."""
     return isinstance(y, (list, tuple)) and len(y) > 0 and isinstance(y[0], (list, tuple, set))


 def evaluate_single_or_multi_label(
     predictions: np.ndarray,
-    y: LabelType,
+    y: list[int] | list[str] | list[list[int]] | list[list[str]],
     output_dict: bool = False,
 ) -> str | dict[str, dict[str, float]]:
     """
@@ -292,16 +292,22 @@ def evaluate_single_or_multi_label(
     :return: A classification report.
     """
     if _is_multi_label_shaped(y):
+        # Cast because the type checker doesn't understand that y is a list of lists.
+        y = cast(list[list[str]] | list[list[int]], y)
         classes = sorted(set([label for labels in y for label in labels]))
         mlb = MultiLabelBinarizer(classes=classes)
-        y = mlb.fit_transform(y)
-        predictions = mlb.transform(predictions)
-    elif isinstance(y[0], (str, int)):
-        classes = sorted(set(y))
+        y_transformed = mlb.fit_transform(y)
+        predictions_transformed = mlb.transform(predictions)
+    else:
+        if all(isinstance(label, (str, int)) for label in y):
+            y = cast(list[str] | list[int], y)
+            classes = sorted(set(y))
+        y_transformed = np.array(y)
+        predictions_transformed = np.array(predictions)

     report = classification_report(
-        y,
-        predictions,
+        y_transformed,
+        predictions_transformed,
         output_dict=output_dict,
         zero_division=0,
     )
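Note: the refactor keeps the MultiLabelBinarizer path for multi-label targets and converts the single-label path to plain arrays before calling classification_report. A minimal standalone sketch of the multi-label branch, with made-up labels (only the MultiLabelBinarizer / classification_report usage mirrors the diff):

# Sketch of the multi-label evaluation pattern; the labels here are toy data.
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

y_true = [["a", "b"], ["b"], ["a", "c"]]
y_pred = [["a", "b"], ["b"], ["a"]]

classes = sorted({label for labels in y_true for label in labels})
mlb = MultiLabelBinarizer(classes=classes)

# Fit on the gold labels, then transform both sides so they become
# aligned binary indicator matrices of shape (n_samples, n_classes).
y_bin = mlb.fit_transform(y_true)
pred_bin = mlb.transform(y_pred)

print(classification_report(y_bin, pred_bin, zero_division=0))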

model2vec/model.py

Lines changed: 2 additions & 2 deletions
@@ -452,7 +452,7 @@ def _encode_batch(self, sentences: Sequence[str], max_length: int | None) -> np.ndarray:
             id_list_remapped = [self.token_mapping.get(token_id, token_id) for token_id in id_list]
             emb = self.embedding[id_list_remapped]
             if self.weights is not None:
-                emb = (emb * self.weights[id_list][:, None])
+                emb = emb * self.weights[id_list][:, None]
             emb = emb.mean(axis=0)

             out.append(emb)
@@ -514,4 +514,4 @@ def load_local(cls: type[StaticModel], path: PathLike) -> StaticModel:

        embeddings, tokenizer, config = load_local_model(path)

-       return StaticModel(embeddings, tokenizer, config)
+       return StaticModel(embeddings, tokenizer, config=config)
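Note: the first hunk only drops redundant parentheses, but it sits in the token-weighting step of _encode_batch: each token vector is scaled by its weight, then mean-pooled. A small numpy sketch of that pooling, with toy values standing in for the real embedding table and weights:

# Sketch of weighted mean pooling as in _encode_batch; toy data only.
import numpy as np

embedding = np.random.rand(10, 4)   # 10-token vocabulary, 4-dim vectors
weights = np.random.rand(10)        # one scalar weight per token
id_list = [3, 7, 7, 1]              # token ids of one tokenized sentence

emb = embedding[id_list]                 # (4, 4): one row per token
emb = emb * weights[id_list][:, None]    # scale each token vector by its weight
sentence_vector = emb.mean(axis=0)       # (4,): pooled sentence representation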

model2vec/train/base.py

Lines changed: 17 additions & 4 deletions
@@ -16,7 +16,16 @@


 class FinetunableStaticModel(nn.Module):
-    def __init__(self, *, vectors: torch.Tensor, tokenizer: Tokenizer, out_dim: int = 2, pad_id: int = 0, token_mapping: list[int] | None = None) -> None:
+    def __init__(
+        self,
+        *,
+        vectors: torch.Tensor,
+        tokenizer: Tokenizer,
+        out_dim: int = 2,
+        pad_id: int = 0,
+        token_mapping: list[int] | None = None,
+        weights: torch.Tensor | None = None,
+    ) -> None:
         """
         Initialize a trainable StaticModel from a StaticModel.

@@ -45,7 +54,7 @@ def __init__(self, *, vectors: torch.Tensor, tokenizer: Tokenizer, out_dim: int
         self.token_mapping = nn.Parameter(self.token_mapping, requires_grad=False)
         self.embeddings = nn.Embedding.from_pretrained(vectors.clone(), freeze=False, padding_idx=pad_id)
         self.head = self.construct_head()
-        self.w = self.construct_weights()
+        self.w = self.construct_weights() if weights is None else nn.Parameter(weights, requires_grad=True)
         self.tokenizer = tokenizer

     def construct_weights(self) -> nn.Parameter:
@@ -70,6 +79,7 @@ def from_pretrained(
     def from_static_model(cls: type[ModelType], *, model: StaticModel, out_dim: int = 2, **kwargs: Any) -> ModelType:
         """Load the model from a static model."""
         model.embedding = np.nan_to_num(model.embedding)
+        weights = torch.from_numpy(model.weights) if model.weights is not None else None
         embeddings_converted = torch.from_numpy(model.embedding)
         if model.token_mapping is not None:
             token_mapping = [i for _, i in sorted(model.token_mapping.items(), key=lambda x: x[0])]
@@ -81,6 +91,7 @@ def from_static_model(cls: type[ModelType], *, model: StaticModel, out_dim: int
             out_dim=out_dim,
             tokenizer=model.tokenizer,
             token_mapping=token_mapping,
+            weights=weights,
             **kwargs,
         )

@@ -139,7 +150,9 @@ def to_static_model(self) -> StaticModel:
         w = torch.sigmoid(self.w).detach().cpu().numpy()
         token_mapping = {i: int(token_id) for i, token_id in enumerate(self.token_mapping.tolist())}

-        return StaticModel(vectors=emb, weights=w, tokenizer=self.tokenizer, normalize=True, token_mapping=token_mapping)
+        return StaticModel(
+            vectors=emb, weights=w, tokenizer=self.tokenizer, normalize=True, token_mapping=token_mapping
+        )


 class TextDataset(Dataset):
@@ -169,7 +182,7 @@ def collate_fn(batch: list[tuple[list[list[int]], int]]) -> tuple[torch.Tensor, torch.Tensor]:
        """Collate function."""
        texts, targets = zip(*batch)

-       tensors = [torch.LongTensor(x) for x in texts]
+       tensors: list[torch.Tensor] = [torch.LongTensor(x) for x in texts]
        padded = pad_sequence(tensors, batch_first=True, padding_value=0)

        return padded, torch.stack(targets)
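Note: besides threading the optional weights tensor through __init__ and from_static_model, the diff touches collate_fn, which pads variable-length token id lists into one batch tensor with pad_sequence. A short sketch of that collation outside the class (the batch contents are illustrative):

# Sketch of the pad_sequence-based collation used by collate_fn; toy batch.
import torch
from torch.nn.utils.rnn import pad_sequence

batch = [([1, 2, 3], 0), ([4, 5], 1), ([6], 0)]   # (token ids, target) pairs
texts, targets = zip(*batch)

tensors: list[torch.Tensor] = [torch.LongTensor(x) for x in texts]
padded = pad_sequence(tensors, batch_first=True, padding_value=0)  # shape (3, 3)
targets_tensor = torch.tensor(targets, dtype=torch.long)

print(padded)
print(targets_tensor)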

model2vec/train/classifier.py

Lines changed: 31 additions & 17 deletions
@@ -4,7 +4,7 @@
 from collections import Counter
 from itertools import chain
 from tempfile import TemporaryDirectory
-from typing import TypeVar, cast
+from typing import Generic, TypeVar, cast

 import lightning as pl
 import numpy as np
@@ -25,10 +25,11 @@
 logger = logging.getLogger(__name__)
 _RANDOM_SEED = 42

-LabelType = TypeVar("LabelType", list[str], list[int], list[list[str]], list[list[int]])
+PossibleLabels = list[str] | list[list[str]]
+LabelType = TypeVar("LabelType", list[str], list[list[str]])


-class StaticModelForClassification(FinetunableStaticModel):
+class StaticModelForClassification(FinetunableStaticModel, Generic[LabelType]):
     def __init__(
         self,
         *,
@@ -39,15 +40,23 @@ def __init__(
         out_dim: int = 2,
         pad_id: int = 0,
         token_mapping: list[int] | None = None,
+        weights: torch.Tensor | None = None,
     ) -> None:
         """Initialize a standard classifier model."""
         self.n_layers = n_layers
         self.hidden_dim = hidden_dim
         # Alias: Follows scikit-learn. Set to dummy classes
-        self.classes_: list[str] = [str(x) for x in range(out_dim)]
+        self.classes_: list[str] = ["0", "1"]
         # multilabel flag will be set based on the type of `y` passed to fit.
         self.multilabel: bool = False
-        super().__init__(vectors=vectors, out_dim=out_dim, pad_id=pad_id, tokenizer=tokenizer, token_mapping=token_mapping)
+        super().__init__(
+            vectors=vectors,
+            out_dim=out_dim,
+            pad_id=pad_id,
+            tokenizer=tokenizer,
+            token_mapping=token_mapping,
+            weights=weights,
+        )

     @property
     def classes(self) -> np.ndarray:
@@ -166,7 +175,7 @@ def fit(
        :param device: The device to train on. If this is "auto", the device is chosen automatically.
        :param X_val: The texts to be used for validation.
        :param y_val: The labels to be used for validation.
-       :param class_weight: The weight of the classes. If None, all classes are weighted equally. Must
+       :param class_weight: The weight of the classes. If None, all classes are weighted equally. Must
            have the same length as the number of classes.
        :return: The fitted model.
        :raises ValueError: If either X_val or y_val are provided, but not both.
@@ -202,7 +211,7 @@ def fit(
             base_number = int(min(max(1, (len(train_texts) / 30) // 32), 16))
             batch_size = int(base_number * 32)
             logger.info("Batch size automatically set to %d.", batch_size)
-
+
         if class_weight is not None:
             if len(class_weight) != len(self.classes_):
                 raise ValueError("class_weight must have the same length as the number of classes.")
@@ -284,11 +293,8 @@ def _initialize(self, y: LabelType) -> None:

        :param y: The labels.
        :raises ValueError: If the labels are inconsistent.
-       """
-       if isinstance(y[0], (str, int)):
-           # Check if all labels are strings or integers.
-           if not all(isinstance(label, (str, int)) for label in y):
-               raise ValueError("Inconsistent label types in y. All labels must be strings or integers.")
+       """
+       if all(isinstance(label, str) for label in y):
            self.multilabel = False
            classes = sorted(set(y))
        else:
@@ -330,13 +336,13 @@ def _prepare_dataset(self, X: list[str], y: LabelType, max_length: int = 512) ->
                indices = [mapping[label] for label in sample_labels]
                labels_tensor[i, indices] = 1.0
        else:
-           labels_tensor = torch.tensor([self.classes_.index(label) for label in cast(list[str], y)], dtype=torch.long)
+           labels_tensor = torch.tensor([self.classes_.index(label) for label in y], dtype=torch.long)
        return TextDataset(tokenized, labels_tensor)

     def _train_test_split(
         self,
         X: list[str],
-        y: list[str] | list[list[str]],
+        y: LabelType,
         test_size: float,
     ) -> tuple[list[str], list[str], LabelType, LabelType]:
         """
@@ -384,12 +390,18 @@ def to_pipeline(self) -> StaticModelPipeline:


 class _ClassifierLightningModule(pl.LightningModule):
-    def __init__(self, model: StaticModelForClassification, learning_rate: float, class_weight: torch.Tensor | None = None) -> None:
+    def __init__(
+        self, model: StaticModelForClassification, learning_rate: float, class_weight: torch.Tensor | None = None
+    ) -> None:
         """Initialize the LightningModule."""
         super().__init__()
         self.model = model
         self.learning_rate = learning_rate
-        self.loss_function = nn.CrossEntropyLoss(weight=class_weight) if not model.multilabel else nn.BCEWithLogitsLoss(pos_weight=class_weight)
+        self.loss_function = (
+            nn.CrossEntropyLoss(weight=class_weight)
+            if not model.multilabel
+            else nn.BCEWithLogitsLoss(pos_weight=class_weight)
+        )

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Simple forward pass."""
@@ -408,10 +420,12 @@ def validation_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
         x, y = batch
         head_out, _ = self.model(x)
         loss = self.loss_function(head_out, y)
+
+        accuracy: float
         if self.model.multilabel:
             preds = (torch.sigmoid(head_out) > 0.5).float()
             # Multilabel accuracy is defined as the Jaccard score averaged over samples.
-            accuracy = jaccard_score(y.cpu(), preds.cpu(), average="samples")
+            accuracy = cast(float, jaccard_score(y.cpu(), preds.cpu(), average="samples"))
         else:
             accuracy = (head_out.argmax(dim=1) == y).float().mean()
         self.log("val_loss", loss)
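Note: the reformatted LightningModule keeps the same behaviour: BCEWithLogitsLoss plus a sample-averaged Jaccard score when the model is multilabel, CrossEntropyLoss plus argmax accuracy otherwise. A standalone sketch of that branching with dummy logits and targets (nothing below comes from the real model):

# Sketch of the multilabel loss/metric split in _ClassifierLightningModule; dummy data.
import torch
from torch import nn
from sklearn.metrics import jaccard_score

multilabel = True
class_weight = None

loss_function = (
    nn.CrossEntropyLoss(weight=class_weight)
    if not multilabel
    else nn.BCEWithLogitsLoss(pos_weight=class_weight)
)

head_out = torch.randn(4, 3)  # logits for 4 samples, 3 labels
y = torch.tensor([[1.0, 0.0, 1.0], [0.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]])

loss = loss_function(head_out, y)
preds = (torch.sigmoid(head_out) > 0.5).float()
# Multilabel accuracy as the Jaccard score averaged over samples.
accuracy = float(jaccard_score(y.numpy(), preds.numpy(), average="samples"))
print(loss.item(), accuracy)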

tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ def mock_vectors() -> np.ndarray:
 @pytest.fixture
 def mock_config() -> dict[str, Any]:
     """Create a mock config."""
-    return {"some_config": "value", "token_mapping": [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]}
+    return {"some_config": "value"}


 @pytest.fixture(scope="session")

tests/test_trainable.py

Lines changed: 3 additions & 2 deletions
@@ -91,8 +91,8 @@ def test_conversion(mock_trained_pipeline: StaticModelForClassification) -> None:
     """Test the conversion to numpy."""
     staticmodel = mock_trained_pipeline.to_static_model()
     with torch.no_grad():
-        result_1 = mock_trained_pipeline._encode(torch.tensor([[0, 1], [1, 0]]).long()).numpy()
-        result_2 = staticmodel.embedding[[[0, 1], [1, 0]]].mean(0)
+        result_1 = mock_trained_pipeline._encode(torch.tensor([[1, 2], [2, 1]]).long()).numpy()
+        result_2 = staticmodel.embedding[[[1, 2], [2, 1]]].mean(0)
     result_2 /= np.linalg.norm(result_2, axis=1, keepdims=True)

     assert np.allclose(result_1, result_2)
@@ -174,6 +174,7 @@ def test_y_val_none() -> None:
     model.fit(X, y, X_val=None, y_val=y_val)
     model.fit(X, y, X_val=None, y_val=None)

+
 def test_class_weight() -> None:
     """Test the class weight function."""
     tokenizer = AutoTokenizer.from_pretrained("tests/data/test_tokenizer").backend_tokenizer
