11"""CNNScorer class for scoring."""
22
3- from collections import Counter
3+ from __future__ import annotations
4+
45import re
5- from typing import Any
6+ from collections import Counter
7+ from typing import Any , Dict , List , Optional , Union
68
79import numpy as np
810import numpy .typing as npt
9- from torch import nn
1011import torch
11- from torch .utils .data import TensorDataset , DataLoader
12+ from torch import nn , Tensor
13+ from torch .utils .data import DataLoader , TensorDataset
1214
1315from autointent import Context
1416from autointent ._callbacks import REPORTERS_NAMES
@@ -21,8 +23,6 @@ class CNNScorer(BaseScorer):
2123 """Convolutional Neural Network (CNN) scorer for intent classification."""
2224
2325 name = "cnn"
24- _n_classes : int
25- _multilabel : bool
2626 supports_multilabel = True
2727 supports_multiclass = True
2828
@@ -33,8 +33,8 @@ def __init__(
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        report_to: REPORTERS_NAMES | None = None,
-        **cnn_kwargs: dict[str, Any],
+        report_to: REPORTERS_NAMES | None = None,  # type: ignore[no-any-return]
+        **cnn_kwargs: Dict[str, Any],
     ) -> None:
         self.max_seq_length = max_seq_length
         self.num_train_epochs = num_train_epochs
@@ -45,11 +45,14 @@ def __init__(
         self.cnn_config = cnn_kwargs

         # Will be initialized during fit()
-        self._model = None
-        self._vocab = None
+        self._model: Optional[TextCNN] = None
+        self._vocab: Optional[Dict[str, int]] = None
         self._padding_idx = 0
         self._unk_token = "<UNK>"  # noqa: S105
         self._pad_token = "<PAD>"  # noqa: S105
+        self._unk_idx = 1
+        self._n_classes: int = 0
+        self._multilabel: bool = False

     @classmethod
     def from_context(
@@ -59,7 +62,7 @@ def from_context(
         batch_size: int = 8,
         learning_rate: float = 5e-5,
         seed: int = 0,
-        **cnn_kwargs: dict[str, Any],
+        **cnn_kwargs: Dict[str, Any],
     ) -> "CNNScorer":
         return cls(
             num_train_epochs=num_train_epochs,
@@ -70,22 +73,23 @@ def from_context(
             **cnn_kwargs,
         )

-    def fit(self, utterances: list[str], labels: ListOfLabels, clear_cache: bool = False) -> None:
-        if clear_cache:
-            self.clear_cache()
-
+    def fit(self, utterances: List[str], labels: ListOfLabels) -> None:
         self._validate_task(labels)
-        self._multilabel = isinstance(labels[0], list | np.ndarray)
+        self._multilabel = isinstance(labels[0], (list, np.ndarray))
+        self._n_classes = len(labels[0]) if self._multilabel else len(set(labels))

         # Build vocabulary and tokenize
         self._build_vocab(utterances)

         # Convert text to padded indices
         x = self._text_to_indices(utterances)
-        x = torch.tensor(x, dtype=torch.long)
-        y = torch.tensor(labels, dtype=torch.long)
+        x_tensor = torch.tensor(x, dtype=torch.long)
+        y_tensor = torch.tensor(labels, dtype=torch.long if not self._multilabel else torch.float)

         # Initialize model
+        if self._vocab is None:
+            raise RuntimeError("Vocabulary not built")
+
         self._model = TextCNN(
             vocab_size=len(self._vocab),
             n_classes=self._n_classes,
@@ -98,22 +102,21 @@ def fit(self, utterances: list[str], labels: ListOfLabels, clear_cache: bool = False) -> None:
         )

         # Training
-        self._train_model(x, y)
+        self._train_model(x_tensor, y_tensor)

-    def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
+    def predict(self, utterances: List[str]) -> npt.NDArray[Any]:
         if self._model is None:
-            error_msg = "Model not trained. Call fit() first."
-            raise RuntimeError(error_msg)
+            raise RuntimeError("Model not trained. Call fit() first.")

         x = self._text_to_indices(utterances)
-        x = torch.tensor(x, dtype=torch.long)
+        x_tensor = torch.tensor(x, dtype=torch.long)

         self._model.eval()
-        all_probs = []
+        all_probs: List[npt.NDArray[Any]] = []

         with torch.no_grad():
-            for i in range(0, len(x), self.batch_size):
-                batch_x = x[i:i + self.batch_size]
+            for i in range(0, len(x_tensor), self.batch_size):
+                batch_x = x_tensor[i:i + self.batch_size]
                 outputs = self._model(batch_x)
                 if self._multilabel:
                     probs = torch.sigmoid(outputs).cpu().numpy()
@@ -123,9 +126,9 @@ def predict(self, utterances: list[str]) -> npt.NDArray[Any]:

         return np.concatenate(all_probs, axis=0) if all_probs else np.array([])

-    def _build_vocab(self, utterances: list[str]) -> None:
+    def _build_vocab(self, utterances: List[str]) -> None:
         """Build vocabulary from training utterances."""
-        word_counts = Counter()
+        word_counts: Dict[str, int] = Counter()
         for utterance in utterances:
             words = re.findall(r"\w+", utterance.lower())
             word_counts.update(words)
@@ -137,20 +140,26 @@ def _build_vocab(self, utterances: list[str]) -> None:
         }

         # Add words to vocabulary
+        if self._vocab is None:
+            raise RuntimeError("Vocabulary not initialized")
+
         for word, _ in word_counts.most_common():
             if word not in self._vocab:
                 self._vocab[word] = len(self._vocab)

         self._unk_idx = 1
         self._padding_idx = 0

-    def _text_to_indices(self, utterances: list[str]) -> list[list[int]]:
+    def _text_to_indices(self, utterances: List[str]) -> List[List[int]]:
         """Convert utterances to padded sequences of word indices."""
-        sequences = []
+        if self._vocab is None:
+            raise RuntimeError("Vocabulary not built")
+
+        sequences: List[List[int]] = []
         for utterance in utterances:
             words = re.findall(r"\w+", utterance.lower())
             # Convert words to indices, using UNK for unknown words
-            seq = [self._vocab.get(word, self._unk_idx) for word in words]
+            seq = [self._vocab.get(word, self._unk_idx) for word in words]  # type: ignore
             # Truncate if too long
             seq = seq[:self.max_seq_length]
             # Pad if too short
@@ -162,7 +171,10 @@ def clear_cache(self) -> None:
         self._model = None
         torch.cuda.empty_cache()

-    def _train_model(self, x: torch.Tensor, y: torch.Tensor) -> None:
+    def _train_model(self, x: Tensor, y: Tensor) -> None:
+        if self._model is None:
+            raise RuntimeError("Model not initialized")
+
         dataset = TensorDataset(x, y)
         dataloader = DataLoader(
             dataset,
@@ -182,4 +194,4 @@ def _train_model(self, x: torch.Tensor, y: torch.Tensor) -> None:
                 loss.backward()
                 optimizer.step()

-        self._model.eval()
+        self._model.eval()
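
For quick reference, a minimal usage sketch of the scorer after this change; the import path, the toy utterances, and the labels below are assumptions, not part of the commit:

# Hypothetical smoke test of the updated fit/predict flow (import path assumed).
from autointent.modules.scoring import CNNScorer

scorer = CNNScorer(max_seq_length=32, num_train_epochs=2, batch_size=4)
scorer.fit(
    ["turn on the lights", "play some jazz", "switch the lights off"],
    [0, 1, 0],  # multiclass labels; multilabel would pass one-hot lists instead
)
probs = scorer.predict(["dim the lights"])  # class probabilities, shape (1, n_classes)
scorer.clear_cache()  # drops the trained model and frees CUDA memory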