docs: refactor utils.selection and uncertainty

nikolay-bushkov · nikolay-bushkov · commit c8b3fa7280e9 · 2018-10-01T14:20:33.000+03:00
diff --git a/modAL/uncertainty.py b/modAL/uncertainty.py
@@ -1,35 +1,27 @@
 """
 Uncertainty measures and uncertainty based sampling strategies for the active learning models.
 """
+from typing import Tuple
 
 import numpy as np
 from scipy.stats import entropy
 from sklearn.exceptions import NotFittedError
+from sklearn.base import BaseEstimator
 
 from modAL.utils.selection import multi_argmax
+from modAL.utils.data import modALinput
 
 
-def classifier_uncertainty(classifier, X, **predict_proba_kwargs):
+def classifier_uncertainty(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
     """
     Classification uncertainty of the classifier for the provided samples.
 
-    :param classifier:
-        The classifier for which the uncertainty is to be measured.
-    :type classifier:
-        sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
+    Args:
+        classifier: The classifier for which the uncertainty is to be measured.
+        X: The samples for which the uncertainty of classification is to be measured.
+        **predict_proba_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.
 
-    :param X:
-        The samples for which the uncertainty of classification is to be measured.
-    :type X:
-        numpy.ndarray of shape (n_samples, n_features)
-
-    :param predict_proba_kwargs:
-        Keyword arguments to be passed for the predict_proba method of the classifier.
-    :type predict_proba_kwargs:
-        keyword arguments
-
-    :returns:
-      - **uncertainty** *(numpy.ndarray of shape (n_samples, ))* --
+    Returns:
         Classifier uncertainty, which is 1 - P(prediction is correct).
     """
     # calculate uncertainty for each point provided
@@ -43,36 +35,23 @@ def classifier_uncertainty(classifier, X, **predict_proba_kwargs):
     return uncertainty
 
 
-def classifier_margin(classifier, X, **predict_proba_kwargs):
+def classifier_margin(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
     """
-    Classification margin uncertainty of the classifier for the provided samples.
-    This uncertainty measure takes the first and second most likely predictions
-    and takes the difference of their probabilities, which is the margin.
-
-    :param classifier:
-        The classifier for which the uncertainty is to be measured
-    :type classifier:
-        sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
-
-    :param X:
-        The samples for which the uncertainty of classification is to be measured
-    :type X:
-        numpy.ndarray of shape (n_samples, n_features)
-
-    :param predict_proba_kwargs:
-        Keyword arguments to be passed for the predict_proba method of the classifier
-    :type predict_proba_kwargs:
-        keyword arguments
-
-    :returns:
-      - **margin** *(numpy.ndarray of shape (n_samples, ))* --
-        Margin uncertainty, which is the difference of the probabilities of first
-        and second most likely predictions.
+    Classification margin uncertainty of the classifier for the provided samples. This uncertainty measure takes the
+    first and second most likely predictions and takes the difference of their probabilities, which is the margin.
+
+    Args:
+        classifier: The classifier for which the prediction margin is to be measured.
+        X: The samples for which the prediction margin of classification is to be measured.
+        **predict_proba_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.
+
+    Returns:
+        Margin uncertainty, which is the difference of the probabilities of first and second most likely predictions.
     """
     try:
         classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
     except NotFittedError:
-        return np.zeros(shape=(len(X), ))
+        return np.zeros(shape=(X.shape[0], ))
 
     if classwise_uncertainty.shape[1] == 1:
         return np.zeros(shape=(classwise_uncertainty.shape[0],))
@@ -83,140 +62,80 @@ def classifier_margin(classifier, X, **predict_proba_kwargs):
     return margin
 
 
-def classifier_entropy(classifier, X, **predict_proba_kwargs):
+def classifier_entropy(classifier: BaseEstimator, X: modALinput, **predict_proba_kwargs) -> np.ndarray:
     """
     Entropy of predictions of the for the provided samples.
 
-    :param classifier:
-        The classifier for which the prediction entropy is to be measured.
-    :type classifier:
-        sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
-
-    :param X:
-        The samples for which the prediction entropy is to be measured.
-    :type X:
-        numpy.ndarray of shape (n_samples, n_features)
-
-    :param predict_proba_kwargs:
-        Keyword arguments to be passed for the predict_proba method of the classifier.
-    :type predict_proba_kwargs:
-        keyword arguments
+    Args:
+        classifier: The classifier for which the prediction entropy is to be measured.
+        X: The samples for which the prediction entropy is to be measured.
+        **predict_proba_kwargs: Keyword arguments to be passed for the :meth:`predict_proba` of the classifier.
 
-    :returns:
-      - **entr** *(numpy.ndarray of shape (n_samples, ))* --
+    Returns:
         Entropy of the class probabilities.
     """
     try:
         classwise_uncertainty = classifier.predict_proba(X, **predict_proba_kwargs)
     except NotFittedError:
-        return np.zeros(shape=(len(X), ))
+        return np.zeros(shape=(X.shape[0], ))
 
     return np.transpose(entropy(np.transpose(classwise_uncertainty)))
 
 
-def uncertainty_sampling(classifier, X, n_instances=1, **uncertainty_measure_kwargs):
+def uncertainty_sampling(classifier: BaseEstimator, X: modALinput,
+                         n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
     """
     Uncertainty sampling query strategy. Selects the least sure instances for labelling.
 
-    :param classifier:
-        The classifier for which the labels are to be queried.
-    :type classifier:
-        sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
-
-    :param X:
-        The pool of samples to query from.
-    :type X:
-        numpy.ndarray of shape (n_samples, n_features)
-
-    :param n_instances:
-        Number of samples to be queried.
-    :type n_instances:
-        int
-
-    :param uncertainty_measure_kwargs:
-        Keyword arguments to be passed for the uncertainty measure function.
-    :type uncertainty_measure_kwargs:
-        keyword arguments
-
-    :returns:
-      - **query_idx** *(numpy.ndarray of shape (n_instances, ))* --
-        The indices of the instances from X chosen to be labelled.
-      - **X[query_idx]** *(numpy.ndarray of shape (n_instances, n_features))* --
-        The instances from X chosen to be labelled.
+    Args:
+        classifier: The classifier for which the labels are to be queried.
+        X: The pool of samples to query from.
+        n_instances: Number of samples to be queried.
+        **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function.
+
+    Returns:
+        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
     """
     uncertainty = classifier_uncertainty(classifier, X, **uncertainty_measure_kwargs)
     query_idx = multi_argmax(uncertainty, n_instances=n_instances)
 
     return query_idx, X[query_idx]
 
 
-def margin_sampling(classifier, X, n_instances=1, **uncertainty_measure_kwargs):
+def margin_sampling(classifier: BaseEstimator, X: modALinput,
+                    n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
     """
-    Margin sampling query strategy. Selects the instances where the difference between
-    the first most likely and second most likely classes are the smallest.
-
-    :param classifier:
-        The classifier for which the labels are to be queried.
-    :type classifier:
-        sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
-
-    :param X:
-        The pool of samples to query from.
-    :type X:
-        numpy.ndarray of shape (n_samples, n_features)
-
-    :param n_instances:
-        Number of samples to be queried.
-    :type n_instances:
-        int
-
-    :param uncertainty_measure_kwargs:
-        Keyword arguments to be passed for the uncertainty measure function.
-    :type uncertainty_measure_kwargs:
-        keyword arguments
-
-    :returns:
-      - **query_idx** *(numpy.ndarray of shape (n_instances, ))* --
-        The indices of the instances from X chosen to be labelled.
-      - **X[query_idx]** *(numpy.ndarray of shape (n_instances, n_features))* --
-        The instances from X chosen to be labelled.
+    Margin sampling query strategy. Selects the instances where the difference between the first most likely and second
+    most likely classes are the smallest.
+
+    Args:
+        classifier: The classifier for which the labels are to be queried.
+        X: The pool of samples to query from.
+        n_instances: Number of samples to be queried.
+        **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function.
+
+    Returns:
+        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
     """
     margin = classifier_margin(classifier, X, **uncertainty_measure_kwargs)
     query_idx = multi_argmax(-margin, n_instances=n_instances)
 
     return query_idx, X[query_idx]
 
 
-def entropy_sampling(classifier, X, n_instances=1, **uncertainty_measure_kwargs):
+def entropy_sampling(classifier: BaseEstimator, X: modALinput,
+                     n_instances: int = 1, **uncertainty_measure_kwargs) -> Tuple[np.ndarray, modALinput]:
     """
-    Entropy sampling query strategy. Selects the instances where the class probabilities
-    have the largest entropy.
-
-    :param classifier:
-        The classifier for which the labels are to be queried.
-    :type classifier:
-        sklearn classifier object, for instance sklearn.ensemble.RandomForestClassifier
-
-    :param X:
-        The pool of samples to query from.
-    :type X:
-        numpy.ndarray of shape (n_samples, n_features)
-
-    :param n_instances:
-        Number of samples to be queried.
-    :type n_instances:
-        int
-
-    :param uncertainty_measure_kwargs:
-        Keyword arguments to be passed for the uncertainty measure function.
-    :type uncertainty_measure_kwargs:
-        keyword arguments
-
-    :returns:
-      - **query_idx** *(numpy.ndarray of shape (n_instances, ))* --
-        The indices of the instances from X chosen to be labelled.
-      - **X[query_idx]** *(numpy.ndarray of shape (n_instances, n_features))* --
-        The instances from X chosen to be labelled.
+    Entropy sampling query strategy. Selects the instances where the class probabilities have the largest entropy.
+
+    Args:
+        classifier: The classifier for which the labels are to be queried.
+        X: The pool of samples to query from.
+        n_instances: Number of samples to be queried.
+        **uncertainty_measure_kwargs: Keyword arguments to be passed for the uncertainty measure function.
+
+    Returns:
+        The indices of the instances from X chosen to be labelled; the instances from X chosen to be labelled.
     """
     entropy = classifier_entropy(classifier, X, **uncertainty_measure_kwargs)
     query_idx = multi_argmax(entropy, n_instances=n_instances)
diff --git a/modAL/utils/selection.py b/modAL/utils/selection.py
@@ -5,50 +5,35 @@
 import numpy as np
 
 
-def multi_argmax(values, n_instances=1):
+def multi_argmax(values: np.ndarray, n_instances: int = 1) -> np.ndarray:
     """
     Selects the indices of the n_instances highest values.
 
-    :param values:
-        Contains the values to be selected from.
-    :type values:
-        numpy.ndarray of shape = (n_samples, 1)
+    Args:
+        values: Contains the values to be selected from.
+        n_instances: Specifies how many indices to return.
 
-    :param n_instances:
-        Specifies how many indices to return.
-    :type n_instances:
-        int
-
-    :returns:
-      - **max_idx** *(numpy.ndarray of shape = (n_samples, 1))* --
+    Returns:
         Contains the indices of the n_instances largest values.
-
     """
-    assert n_instances <= len(values), 'n_instances must be less or equal than the size of utility'
+    assert n_instances <= values.shape[0], 'n_instances must be less or equal than the size of utility'
 
     max_idx = np.argpartition(-values, n_instances-1, axis=0)[:n_instances]
     return max_idx
 
 
-def weighted_random(weights, n_instances=1):
+def weighted_random(weights: np.ndarray, n_instances: int = 1) -> np.ndarray:
     """
     Returns n_instances indices based on the weights.
 
-    :param weights:
-        Contains the weights of the sampling.
-    :type weights:
-        numpy.ndarray of shape = (n_samples, 1)
-
-    :param n_instances:
-        Specifies how many indices to return.
-    :type n_instances:
-        int
+    Args:
+        weights: Contains the weights of the sampling.
+        n_instances: Specifies how many indices to return.
 
-    :returns:
-      - **random_idx** *(numpy.ndarray of shape = (n_instances, 1))* --
+    Returns:
         n_instances random indices based on the weights.
     """
-    assert n_instances <= len(weights), 'n_instances must be less or equal than the size of utility'
+    assert n_instances <= weights.shape[0], 'n_instances must be less or equal than the size of utility'
     weight_sum = np.sum(weights)
     assert weight_sum > 0, 'the sum of weights must be larger than zero'