Adding confidence to the output of local Fusions

mmerce · mmerce · commit fcaf51b98c08 · 2023-06-16T01:00:39.000+02:00
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -3,12 +3,16 @@
 History
 -------
 
+9.5.0 (2023-06-16)
+------------------
+
+- Extending Local Fusions output to include confidence.
+
 9.4.0 (2023-06-14)
 ------------------
 
 - Extending LocalModel class to handle Time Series locally.
 
-
 9.3.0 (2023-06-09)
 ------------------
 
diff --git a/bigml/deepnet.py b/bigml/deepnet.py
@@ -397,6 +397,8 @@ def predict(self, input_data, operating_point=None, operating_kind=None,
             if not isinstance(prediction, dict):
                 prediction = {"prediction": round(prediction, DECIMALS)}
             prediction.update({"unused_fields": unused_fields})
+            if "probability" in prediction:
+                prediction["confidence"] = prediction.get("probability")
         else:
             if isinstance(prediction, dict):
                 prediction = prediction["prediction"]
@@ -489,6 +491,16 @@ def predict_probability(self, input_data, compact=False):
             return [category['probability'] for category in distribution]
         return distribution
 
+    def predict_confidence(self, input_data, compact=False):
+        """Reports each class probability under the "confidence" key;
+        compact output and regressions are returned as predicted."""
+        probabilities = self.predict_probability(input_data, compact=compact)
+        if compact or self.regression:
+            return probabilities
+        return [{"category": entry["category"],
+                 "confidence": entry["probability"]}
+                for entry in probabilities]
+
     #pylint: disable=locally-disabled,invalid-name
     def _sort_predictions(self, a, b, criteria):
         """Sorts the categories in the predicted node according to the
@@ -516,6 +528,8 @@ def predict_operating_kind(self, input_data, operating_kind=None):
         prediction = predictions[0]
         prediction["prediction"] = prediction["category"]
         del prediction["category"]
+        if "probability" in prediction:
+            prediction["confidence"] = prediction.get("probability")
         return prediction
 
     def predict_operating(self, input_data, operating_point=None):
@@ -543,6 +557,8 @@ def predict_operating(self, input_data, operating_point=None):
                 prediction = prediction[0]
         prediction["prediction"] = prediction["category"]
         del prediction["category"]
+        if "probability" in prediction:
+            prediction["confidence"] = prediction.get("probability")
         return prediction
 
     def data_transformations(self):
diff --git a/bigml/ensemble.py b/bigml/ensemble.py
@@ -860,6 +860,8 @@ def predict(self, input_data, method=None,
                     set(prediction.get("unused_fields", [])))
             if not isinstance(result, dict):
                 result = {"prediction": round(result, DECIMALS)}
+            if "probability" in result and "confidence" not in result:
+                result["confidence"] = result["probability"]
             result['unused_fields'] = list(unused_fields)
 
         return result
diff --git a/bigml/fusion.py b/bigml/fusion.py
@@ -51,6 +51,7 @@
 from bigml.multivotelist import MultiVoteList
 from bigml.util import cast, check_no_missing_numerics, use_cache, load, \
     dump, dumps, NUMERIC
+from bigml.constants import DECIMALS
 from bigml.supervised import SupervisedModel
 from bigml.modelfields import ModelFields
 from bigml.tree_utils import add_distribution
@@ -248,7 +249,7 @@ def predict_probability(self, input_data,
         each possible output class, based on input values.  The input
         fields must be a dictionary keyed by field name or field ID.
 
-        For regressions, the output is a single element list
+        For regressions, the output is a single element
         containing the prediction.
 
         :param input_data: Input data to be predicted
@@ -264,6 +265,7 @@ def predict_probability(self, input_data,
         if not self.missing_numerics:
             check_no_missing_numerics(input_data, self.model_fields)
 
+        weights = []
         for models_split in self.models_splits:
             models = []
             for model in models_split:
@@ -287,35 +289,34 @@ def predict_probability(self, input_data,
                     continue
                 if self.regression:
                     prediction = prediction[0]
-                    if self.weights is not None:
-                        prediction = self.weigh(prediction, model.resource_id)
-                else:
-                    if self.weights is not None:
-                        prediction = self.weigh( \
-                            prediction, model.resource_id)
-                    # we need to check that all classes in the fusion
-                    # are also in the composing model
-                    if not self.regression and \
-                            self.class_names != model.class_names:
-                        try:
-                            prediction = rearrange_prediction( \
-                                model.class_names,
-                                self.class_names,
-                                prediction)
-                        except AttributeError:
-                            # class_names should be defined, but just in case
-                            pass
+                if self.weights is not None:
+                    weights.append(1 if not self.weights else self.weights[
+                        self.model_ids.index(model.resource_id)])
+                    prediction = self.weigh(prediction, model.resource_id)
+                # we need to check that all classes in the fusion
+                # are also in the composing model
+                if not self.regression and \
+                        self.class_names != model.class_names:
+                    try:
+                        prediction = rearrange_prediction( \
+                            model.class_names,
+                            self.class_names,
+                            prediction)
+                    except AttributeError:
+                        # class_names should be defined, but just in case
+                        pass
                 votes_split.append(prediction)
             votes.extend(votes_split)
         if self.regression:
-            total_weight = len(votes.predictions) if self.weights is None \
-                else sum(self.weights)
-            prediction = sum(votes.predictions) / float(total_weight)
+            prediction = 0
+            total_weight = sum(weights)
+            for index, pred in enumerate(votes.predictions):
+                prediction += pred # the weight is already considered in pred
+            prediction /= float(total_weight)
             if compact:
                 output = [prediction]
             else:
                 output = {"prediction": prediction}
-
         else:
             output = votes.combine_to_distribution(normalize=True)
             if not compact:
@@ -326,6 +327,97 @@ def predict_probability(self, input_data,
 
         return output
 
+    def predict_confidence(self, input_data,
+                           missing_strategy=LAST_PREDICTION,
+                           compact=False):
+
+        """For classification models, predicts a confidence for
+        each possible output class, based on input values.  The input
+        fields must be a dictionary keyed by field name or field ID.
+
+        For regressions, the output contains the weighted mean of the
+        component predictions and the plain mean of their confidences:
+        a map with the keys "prediction" and "confidence", or the list
+        [prediction, confidence] if compact is True.
+
+        WARNING: Component models whose predict_confidence call raises
+        (e.g. models that cannot provide a confidence) are skipped, so
+        they do not contribute to the result.
+
+        :param input_data: Input data to be predicted
+        :param missing_strategy: LAST_PREDICTION|PROPORTIONAL missing strategy
+                                 for missing fields
+        :param compact: If False, prediction is returned as a list of maps, one
+                        per class, with the keys "category" and "confidence"
+                        mapped to the name of the class and its confidence,
+                        respectively.  If True, returns a list of confidences
+                        ordered as the fusion's class names.
+        """
+        if not self.missing_numerics:
+            check_no_missing_numerics(input_data, self.model_fields)
+
+        predictions = []
+        weights = []
+        for models_split in self.models_splits:
+            # keep each model's own type next to it, as the arguments used
+            # in its call depend on that type
+            models = []
+            for model in models_split:
+                model_type = get_resource_type(model)
+                local_class = Fusion if model_type == "fusion" \
+                    else SupervisedModel
+                models.append((model_type, local_class(model, api=self.api)))
+            for model_type, model in models:
+                try:
+                    kwargs = {"compact": False}
+                    # missing_strategy is only forwarded to these types
+                    if model_type in ["model", "ensemble", "fusion"]:
+                        kwargs.update({"missing_strategy": missing_strategy})
+                    prediction = model.predict_confidence( \
+                        input_data, **kwargs)
+                except Exception:
+                    # logistic regressions can raise this error if they
+                    # have missing_numerics=False and some numeric missings
+                    # are found and Linear Regressions have no confidence
+                    continue
+                predictions.append(prediction)
+                # fusions without weights give every model a weight of 1
+                weights.append(1 if not self.weights else self.weights[
+                    self.model_ids.index(model.resource_id)])
+
+        if self.regression:
+            # the prediction is the weighted mean of the model predictions,
+            # the confidence is the plain mean of the model confidences
+            prediction = sum(pred.get("prediction") * weight for pred, weight
+                             in zip(predictions, weights))
+            prediction /= float(sum(weights))
+            confidence = sum(pred.get("confidence") for pred in predictions)
+            confidence /= float(len(predictions))
+            if compact:
+                return [prediction, confidence]
+            return {"prediction": prediction, "confidence": confidence}
+
+        # one averaged confidence per class, in self.class_names order
+        output = self._combine_confidences(predictions)
+        if compact:
+            return output
+        return [{'category': class_name, 'confidence': confidence}
+                for class_name, confidence in zip(self.class_names, output)]
+
+    def _combine_confidences(self, predictions):
+        """Per-class mean confidence over the classification predictions:
+        the confidences found for a class are added up and divided by
+        the total number of predictions"""
+        total = float(len(predictions))
+        averages = []
+        for class_name in self.class_names:
+            # first entry matching the class in each prediction, 0 if none
+            matches = (next((info.get("confidence") for info in model_output
+                             if info["category"] == class_name), 0)
+                       for model_output in predictions)
+            averages.append(round(sum(matches) / total, DECIMALS))
+        return averages
+
     def weigh(self, prediction, model_id):
         """Weighs the prediction according to the weight associated to the
         current model in the fusion.
@@ -421,16 +513,28 @@ def _predict(self, input_data, missing_strategy=LAST_PREDICTION,
                 missing_strategy=missing_strategy,
                 operating_point=operating_point)
             return prediction
-
         result = self.predict_probability( \
             input_data,
             missing_strategy=missing_strategy,
             compact=False)
+        confidence_result = self.predict_confidence( \
+            input_data,
+            missing_strategy=missing_strategy,
+            compact=False)
 
         if not self.regression:
+            try:
+                for index, value in enumerate(result):
+                    result[index].update(
+                        {"confidence": confidence_result[index]["confidence"]})
+            except Exception as exc:
+                pass
             result = sorted(result, key=lambda x: - x["probability"])[0]
             result["prediction"] = result["category"]
             del result["category"]
+        else:
+            result.update(
+                {"confidence": confidence_result["confidence"]})
 
         # adding unused fields, if any
         if unused_fields:
diff --git a/bigml/logistic.py b/bigml/logistic.py
@@ -264,6 +264,17 @@ def predict_probability(self, input_data, compact=False):
             return [category['probability'] for category in distribution]
         return distribution
 
+    def predict_confidence(self, input_data, compact=False):
+        """For logistic regressions the probability is assumed to be
+        usable as the confidence, so it is reported under that key.
+        """
+        distribution = self.predict_probability(input_data, compact=compact)
+        if compact:
+            return distribution
+        return [
+            {"category": row["category"], "confidence": row["probability"]}
+            for row in distribution]
+
     def predict_operating(self, input_data,
                           operating_point=None):
         """Computes the prediction based on a user-given operating point.
@@ -290,6 +301,7 @@ def predict_operating(self, input_data,
                 prediction = prediction[0]
         prediction["prediction"] = prediction["category"]
         del prediction["category"]
+        prediction['confidence'] = prediction['probability']
         return prediction
 
     def predict_operating_kind(self, input_data,
@@ -310,6 +322,7 @@ def predict_operating_kind(self, input_data,
         prediction = predictions[0]
         prediction["prediction"] = prediction["category"]
         del prediction["category"]
+        prediction['confidence'] = prediction['probability']
         return prediction
 
     #pylint: disable=locally-disabled,consider-using-dict-items
@@ -422,7 +435,8 @@ def predict(self, input_data,
                              for category, probability in predictions]}
 
         if full:
-            result.update({'unused_fields': unused_fields})
+            result.update({'unused_fields': unused_fields, 'confidence':
+                           result['probability']})
         else:
             result = result["prediction"]
 
diff --git a/bigml/supervised.py b/bigml/supervised.py
@@ -154,6 +154,16 @@ def predict_probability(self, *args, **kwargs):
             del new_kwargs["missing_strategy"]
             return self.local_model.predict_probability(*args, **new_kwargs)
 
+    def predict_confidence(self, *args, **kwargs):
+        """Delegates to the local model; on TypeError the call is retried
+        once without the missing_strategy argument"""
+        new_kwargs = dict(kwargs)
+        try:
+            return self.local_model.predict_confidence(*args, **new_kwargs)
+        except TypeError:
+            new_kwargs.pop("missing_strategy", None)  # may not be present
+            return self.local_model.predict_confidence(*args, **new_kwargs)
+
     def data_transformations(self):
         """Returns the pipeline transformations previous to the modeling
         step as a pipeline, so that they can be used in local predictions.
diff --git a/bigml/tests/compare_predictions_steps.py b/bigml/tests/compare_predictions_steps.py
@@ -474,6 +474,14 @@ def the_local_probability_is(step, probability):
     eq_(local_probability, probability, precision=4)
 
 
+def the_local_confidence_is(step, confidence):
+    """Step: the local confidence is <confidence>"""
+    found = step.bigml["local_prediction"]["confidence"]
+    expected = float(confidence) if isinstance(confidence, str) \
+        else confidence
+    eq_(found, expected, precision=4)
+
+
 def eq_local_and_remote_probability(step):
     """Step: check local and remote probability"""
     local_probability = round(step.bigml["local_prediction"]["probability"], 3)
diff --git a/bigml/tests/test_39_optiml_fusion.py b/bigml/tests/test_39_optiml_fusion.py
@@ -252,17 +252,18 @@ def test_scenario4(self):
             And I create a local fusion prediction for "<input_data>"
             Then the local fusion prediction is "<prediction>"
             And the local fusion probability for the prediction is "<probability>"
+            And the local fusion confidence for the prediction is "<confidence>"
         """
         show_doc(self.test_scenario4)
         headers = ["data", "source_wait", "dataset_wait", "model_wait",
                    "fusion_wait", "model_conf", "tag", "input_data",
-                   "objective_id", "prediction", "probability"]
+                   "objective_id", "prediction", "probability", "confidence"]
         examples = [
             ['data/iris.csv', '10', '10', '30', '30',
              '{"tags":["my_fusion_4_tag"], "missing_numerics": true}',
              'my_fusion_4_tag',
              '{"petal width": 1.75, "petal length": 2.45}', "000004",
-             "Iris-setosa", '0.4726']]
+             "Iris-setosa", '0.4726', '0.4726']]
         for example in examples:
             example = dict(zip(headers, example))
             show_method(self, self.bigml["method"], example)
@@ -299,6 +300,8 @@ def test_scenario4(self):
                 self, example["prediction"])
             compare_pred.the_local_probability_is(
                 self, example["probability"])
+            compare_pred.the_local_confidence_is(
+                self, example["confidence"])
 
     def test_scenario5(self):
         """
diff --git a/bigml/version.py b/bigml/version.py
@@ -1 +1 @@
-__version__ = '9.4.0'
+__version__ = '9.5.0'

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '9.4.0'`
	`1`	`+__version__ = '9.5.0'`