Refactor/examples for modules (#54)

Darinochka · web-flow · commit 55191d50cd2f · 2024-12-04T19:34:15.000+03:00
diff --git a/autointent/modules/prediction/_adaptive.py b/autointent/modules/prediction/_adaptive.py
@@ -45,6 +45,29 @@ class AdaptivePredictor(PredictionModule):
     :ivar _r: Scaling factor for thresholds.
     :ivar tags: List of Tag objects for mutually exclusive classes.
     :ivar name: Name of the predictor, defaults to "adaptive".
+
+    Examples
+    --------
+    >>> from autointent.modules import AdaptivePredictor
+    >>> import numpy as np
+    >>> scores = np.array([[0.8, 0.1, 0.4], [0.2, 0.9, 0.5]])
+    >>> labels = [[1, 0, 0], [0, 1, 0]]
+    >>> search_space = [0.1, 0.2, 0.3, 0.5, 0.7]
+    >>> predictor = AdaptivePredictor(search_space=search_space)
+    >>> predictor.fit(scores, labels)
+    >>> predictions = predictor.predict(scores)
+    >>> print(predictions)
+    [[1 0 0]
+     [0 1 0]]
+
+    Save and load the predictor:
+    >>> predictor.dump("outputs/")
+    >>> predictor_loaded = AdaptivePredictor()
+    >>> predictor_loaded.load("outputs/")
+    >>> predictions = predictor_loaded.predict(scores)
+    >>> print(predictions)
+    [[1 0 0]
+     [0 1 0]]
     """
 
     metadata_dict_name = "metadata.json"
diff --git a/autointent/modules/prediction/_argmax.py b/autointent/modules/prediction/_argmax.py
@@ -23,7 +23,35 @@ class ArgmaxPredictorDumpMetadata(BaseMetadataDict):
 
 
 class ArgmaxPredictor(PredictionModule):
-    """Argmax prediction module."""
+    """
+    Argmax prediction module.
+
+    The ArgmaxPredictor is a simple predictor that selects the class with the highest
+    score (argmax) for single-label classification tasks.
+
+    :ivar n_classes: Number of classes in the dataset.
+
+    Examples
+    --------
+    >>> from autointent.modules import ArgmaxPredictor
+    >>> import numpy as np
+    >>> predictor = ArgmaxPredictor()
+    >>> train_scores = np.array([[0.2, 0.8, 0.0], [0.7, 0.1, 0.2]])
+    >>> labels = [1, 0]  # Single-label targets
+    >>> predictor.fit(train_scores, labels)
+    >>> test_scores = np.array([[0.1, 0.5, 0.4], [0.6, 0.3, 0.1]])
+    >>> predictions = predictor.predict(test_scores)
+    >>> print(predictions)
+    [1 0]
+
+    Save the predictor's state:
+    >>> predictor.dump("outputs/")
+    >>> loaded_predictor = ArgmaxPredictor()
+    >>> loaded_predictor.load("outputs/")
+    >>> loaded_predictions = loaded_predictor.predict(test_scores)
+    >>> print(loaded_predictions)
+    [1 0]
+    """
 
     name = "argmax"
     n_classes: int
diff --git a/autointent/modules/prediction/_jinoos.py b/autointent/modules/prediction/_jinoos.py
@@ -26,7 +26,37 @@ class JinoosPredictorDumpMetadata(BaseMetadataDict):
 
 
 class JinoosPredictor(PredictionModule):
-    """Jinoos predictor module."""
+    """
+    Jinoos predictor module.
+
+    JinoosPredictor predicts the best scores for single-label classification tasks
+    and detects out-of-scope (OOS) samples based on a threshold.
+
+    :ivar thresh: The optimized threshold value for OOS detection.
+    :ivar name: Name of the predictor, defaults to "jinoos".
+    :ivar n_classes: Number of classes determined during fitting.
+
+    Examples
+    --------
+    >>> from autointent.modules import JinoosPredictor
+    >>> import numpy as np
+    >>> scores = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
+    >>> labels = [1, 0, 1]
+    >>> search_space = [0.3, 0.5, 0.7]
+    >>> predictor = JinoosPredictor(search_space=search_space)
+    >>> predictor.fit(scores, labels)
+    >>> test_scores = np.array([[0.3, 0.7], [0.5, 0.5]])
+    >>> predictions = predictor.predict(test_scores)
+    >>> print(predictions)
+    [1 0]
+
+    Save and load the predictor state:
+    >>> predictor.dump("outputs/")
+    >>> loaded_predictor = JinoosPredictor()
+    >>> loaded_predictor.load("outputs/")
+    >>> print(loaded_predictor.thresh)  # Example threshold from the search space
+    0.5
+    """
 
     thresh: float
     name = "jinoos"
diff --git a/autointent/modules/prediction/_threshold.py b/autointent/modules/prediction/_threshold.py
@@ -29,7 +29,49 @@ class ThresholdPredictorDumpMetadata(BaseMetadataDict):
 
 
 class ThresholdPredictor(PredictionModule):
-    """Threshold predictor module."""
+    """
+    Threshold predictor module.
+
+    ThresholdPredictor uses a predefined threshold (or array of thresholds) to predict
+    labels for single-label or multi-label classification tasks.
+
+    :ivar metadata_dict_name: Filename for saving metadata to disk.
+    :ivar multilabel: If True, the model supports multi-label classification.
+    :ivar n_classes: Number of classes in the dataset.
+    :ivar tags: Tags for predictions (if any).
+    :ivar name: Name of the predictor, defaults to "threshold".
+
+    Examples
+    --------
+    Single-label classification example:
+    >>> from autointent.modules import ThresholdPredictor
+    >>> import numpy as np
+    >>> scores = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
+    >>> labels = [1, 0, 1]
+    >>> threshold = 0.5
+    >>> predictor = ThresholdPredictor(thresh=threshold)
+    >>> predictor.fit(scores, labels)
+    >>> test_scores = np.array([[0.3, 0.7], [0.5, 0.5]])
+    >>> predictions = predictor.predict(test_scores)
+    >>> print(predictions)
+    [1 0]
+
+    Multi-label classification example:
+    >>> labels = [[1, 0], [0, 1], [1, 1]]
+    >>> predictor = ThresholdPredictor(thresh=[0.5, 0.5])
+    >>> predictor.fit(scores, labels)
+    >>> test_scores = np.array([[0.3, 0.7], [0.6, 0.4]])
+    >>> predictions = predictor.predict(test_scores)
+    >>> print(predictions.tolist())
+    [[0, 1], [1, 0]]
+
+    Save and load the model:
+    >>> predictor.dump("outputs/")
+    >>> loaded_predictor = ThresholdPredictor(thresh=0.5)
+    >>> loaded_predictor.load("outputs/")
+    >>> print(loaded_predictor.thresh)
+    0.5
+    """
 
     metadata: ThresholdPredictorDumpMetadata
     multilabel: bool
@@ -45,9 +87,6 @@ def __init__(
         Initialize threshold predictor.
 
         :param thresh: Threshold for the scores, shape (n_classes,) or float
-        :param multilabel: If multilabel classification, default False
-        :param n_classes: Number of classes, default None
-        :param tags: Tags for predictions, default None
         """
         self.thresh = thresh
 
diff --git a/autointent/modules/prediction/_tunable.py b/autointent/modules/prediction/_tunable.py
@@ -30,7 +30,48 @@ class TunablePredictorDumpMetadata(BaseMetadataDict):
 
 
 class TunablePredictor(PredictionModule):
-    """Tunable predictor module."""
+    """
+    Tunable predictor module.
+
+    TunablePredictor uses an optimization process to find the best thresholds for predicting labels
+    in single-label or multi-label classification tasks. It is designed for datasets with varying
+    score distributions and supports out-of-scope (OOS) detection.
+
+    :ivar name: Name of the predictor, defaults to "tunable".
+    :ivar multilabel: Whether the task is multi-label classification.
+    :ivar n_classes: Number of classes determined during fitting.
+    :ivar tags: Tags for predictions, if any.
+
+    Examples
+    --------
+    Single-label classification:
+    >>> import numpy as np
+    >>> from autointent.modules import TunablePredictor
+    >>> scores = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
+    >>> labels = [1, 0, 1]
+    >>> predictor = TunablePredictor(n_trials=100, seed=42)
+    >>> predictor.fit(scores, labels)
+    >>> test_scores = np.array([[0.3, 0.7], [0.5, 0.5]])
+    >>> predictions = predictor.predict(test_scores)
+    >>> print(predictions)
+    [1 0]
+
+    Multi-label classification:
+    >>> labels = [[1, 0], [0, 1], [1, 1]]
+    >>> predictor = TunablePredictor(n_trials=100, seed=42)
+    >>> predictor.fit(scores, labels)
+    >>> test_scores = np.array([[0.3, 0.7], [0.6, 0.4]])
+    >>> predictions = predictor.predict(test_scores)
+    >>> print(predictions.tolist())
+    [[0, 1], [1, 0]]
+
+    Saving and loading the model:
+    >>> predictor.dump("outputs/")
+    >>> loaded_predictor = TunablePredictor()
+    >>> loaded_predictor.load("outputs/")
+    >>> print(loaded_predictor.thresh)
+    [0.5, 0.7]
+    """
 
     name = "tunable"
     multilabel: bool
diff --git a/autointent/modules/retrieval/_vectordb.py b/autointent/modules/retrieval/_vectordb.py
@@ -24,11 +24,39 @@ class VectorDBMetadata(BaseMetadataDict):
 
 
 class VectorDBModule(RetrievalModule):
-    """
+    r"""
     Module for managing retrieval operations using a vector database.
 
-    This class provides methods for indexing, querying, and managing a vector database for tasks
+    VectorDBModule provides methods for indexing, querying, and managing a vector database for tasks
     such as nearest neighbor retrieval.
+
+    :ivar vector_index: The vector index used for nearest neighbor retrieval.
+    :ivar name: Name of the module, defaults to "vector_db".
+
+    Examples
+    --------
+    Creating and fitting the VectorDBModule:
+    >>> from autointent.modules import VectorDBModule
+    >>> utterances = ["hello world", "how are you?", "good morning"]
+    >>> labels = [1, 2, 3]
+    >>> vector_db = VectorDBModule(k=2, embedder_name="some_embedder", db_dir="./db", device="cpu")
+    >>> vector_db.fit(utterances, labels)
+    >>> def retrieval_metric_fn(true_labels, predicted_labels):
+    ...     # Custom metric function (e.g., accuracy or F1 score)
+    ...     return sum([1 if true == pred else 0 for true, pred
+    ...         in zip(true_labels, predicted_labels)]) / len(true_labels)
+    >>> score = vector_db.score(context, retrieval_metric_fn)
+    >>> print(score)
+
+    Performing predictions:
+    >>> predictions = vector_db.predict(["how is the weather today?"])
+    >>> print(predictions)
+
+    Saving and loading the model:
+    >>> vector_db.dump("outputs/")
+    >>> loaded_vector_db = VectorDBModule(k=2, embedder_name="some_embedder", db_dir="./db", device="cpu")
+    >>> loaded_vector_db.load("outputs/")
+    >>> print(loaded_vector_db.vector_index)
     """
 
     vector_index: VectorIndex
diff --git a/autointent/modules/scoring/_description/description.py b/autointent/modules/scoring/_description/description.py
@@ -27,7 +27,39 @@ class DescriptionScorerDumpMetadata(TypedDict):
 
 
 class DescriptionScorer(ScoringModule):
-    """Scoring module that scores utterances based on similarity to intent descriptions."""
+    r"""
+    Scoring module that scores utterances based on similarity to intent descriptions.
+
+    DescriptionScorer embeds both the utterances and the intent descriptions, then computes a similarity score
+    between the two, using cosine similarity and softmax.
+
+    :ivar weights_file_name: Filename for saving the description vectors (`description_vectors.npy`).
+    :ivar embedder: The embedder used to generate embeddings for utterances and descriptions.
+    :ivar precomputed_embeddings: Flag indicating whether precomputed embeddings are used.
+    :ivar embedding_model_subdir: Directory for storing the embedder's model files.
+    :ivar _vector_index: Internal vector index used when embeddings are precomputed.
+    :ivar db_dir: Directory path where the vector database is stored.
+    :ivar name: Name of the scorer, defaults to "description".
+
+    Examples
+    --------
+    Creating and fitting the DescriptionScorer:
+    >>> from autointent.modules import DescriptionScorer
+    >>> utterances = ["what is your name?", "how old are you?"]
+    >>> labels = [0, 1]
+    >>> descriptions = ["greeting", "age-related question"]
+    >>> scorer = DescriptionScorer(embedder_name="your_embedder", temperature=1.0)
+    >>> scorer.fit(utterances, labels, descriptions)
+
+    Predicting scores:
+    >>> scores = scorer.predict(["tell me about your age?"])
+    >>> print(scores)  # Outputs similarity scores for the utterance against all descriptions
+
+    Saving and loading the scorer:
+    >>> scorer.dump("outputs/")
+    >>> loaded_scorer = DescriptionScorer(embedder_name="your_embedder")
+    >>> loaded_scorer.load("outputs/")
+    """
 
     weights_file_name: str = "description_vectors.npy"
     embedder: Embedder
diff --git a/autointent/modules/scoring/_dnnc/dnnc.py b/autointent/modules/scoring/_dnnc/dnnc.py
@@ -31,7 +31,7 @@ class DNNCScorerDumpMetadata(BaseMetadataDict):
 
 
 class DNNCScorer(ScoringModule):
-    """
+    r"""
     Scoring module for intent classification using a discriminative nearest neighbor classification (DNNC).
 
     This module uses a CrossEncoder for scoring candidate intents and can optionally
@@ -50,6 +50,45 @@ class DNNCScorer(ScoringModule):
           url={https://arxiv.org/abs/2010.13009},
         }
 
+    :ivar crossencoder_subdir: Subdirectory for storing the cross-encoder model (`crossencoder`).
+    :ivar model: The model used for scoring, which could be a `CrossEncoder` or a `CrossEncoderWithLogreg`.
+    :ivar prebuilt_index: Flag indicating whether a prebuilt vector index is used.
+    :ivar _db_dir: Path to the database directory where the vector index is stored.
+    :ivar name: Name of the scorer, defaults to "dnnc".
+
+    Examples
+    --------
+    Creating and fitting the DNNCScorer:
+    >>> from autointent.modules import DNNCScorer
+    >>> utterances = ["what is your name?", "how are you?"]
+    >>> labels = ["greeting", "greeting"]
+    >>> scorer = DNNCScorer(
+    ...     cross_encoder_name="cross_encoder_model",
+    ...     embedder_name="embedder_model",
+    ...     k=5,
+    ...     db_dir="/path/to/database",
+    ...     device="cuda",
+    ...     train_head=True,
+    ...     batch_size=32,
+    ...     max_length=128
+    ... )
+    >>> scorer.fit(utterances, labels)
+
+    Predicting scores:
+    >>> test_utterances = ["Hello!", "What's up?"]
+    >>> scores = scorer.predict(test_utterances)
+    >>> print(scores)  # Outputs similarity scores for the utterances
+
+    Saving and loading the scorer:
+    >>> scorer.dump("outputs/")
+    >>> loaded_scorer = DNNCScorer(
+    ...     cross_encoder_name="cross_encoder_model",
+    ...     embedder_name="embedder_model",
+    ...     k=5,
+    ...     db_dir="/path/to/database",
+    ...     device="cuda"
+    ... )
+    >>> loaded_scorer.load("outputs/")
     """
 
     name = "dnnc"
diff --git a/autointent/modules/scoring/_dnnc/head_training.py b/autointent/modules/scoring/_dnnc/head_training.py
@@ -59,11 +59,36 @@ def construct_samples(
 
 
 class CrossEncoderWithLogreg:
-    """
+    r"""
     Cross-encoder with logistic regression for binary classification.
 
     This class uses a SentenceTransformers CrossEncoder model to extract features
     and LogisticRegressionCV for classification.
+
+    :ivar cross_encoder: The CrossEncoder model used to extract features.
+    :ivar batch_size: Batch size for processing text pairs.
+    :ivar _clf: The trained LogisticRegressionCV classifier.
+    :ivar model_subdir: Directory for storing the cross-encoder model files.
+
+    Examples
+    --------
+    Creating and fitting the CrossEncoderWithLogreg:
+    >>> from autointent.modules import CrossEncoderWithLogreg
+    >>> from sentence_transformers import CrossEncoder
+    >>> model = CrossEncoder("cross-encoder-model")
+    >>> scorer = CrossEncoderWithLogreg(model)
+    >>> utterances = ["What is your name?", "How old are you?"]
+    >>> labels = [1, 0]
+    >>> scorer.fit(utterances, labels)
+
+    Predicting probabilities:
+    >>> test_pairs = [["What is your name?", "Hello!"], ["How old are you?", "What is your age?"]]
+    >>> probs = scorer.predict(test_pairs)
+    >>> print(probs)
+
+    Saving and loading the model:
+    >>> scorer.save("outputs/")
+    >>> loaded_scorer = CrossEncoderWithLogreg.load("outputs/")
     """
 
     def __init__(self, model: CrossEncoder, batch_size: int = 326) -> None:
diff --git a/autointent/modules/scoring/_knn/knn.py b/autointent/modules/scoring/_knn/knn.py
diff --git a/autointent/modules/scoring/_linear.py b/autointent/modules/scoring/_linear.py
diff --git a/autointent/modules/scoring/_mlknn/mlknn.py b/autointent/modules/scoring/_mlknn/mlknn.py