Adding wrapper for supervised models to be used in shap

mmerce · mmerce · commit 074a9102e6b0 · 2023-07-10T23:02:32.000+02:00
diff --git a/bigml/constants.py b/bigml/constants.py
@@ -332,9 +332,11 @@
 OUT_NEW_HEADERS = "output_headers"
 
 # input data allowed formats in batch predictions
+NUMPY = "numpy"
 DATAFRAME = "dataframe"
 INTERNAL = "list_of_dicts"
 
+CATEGORICAL = "categorical"
 
 IMAGE_EXTENSIONS = ['png', 'jpg', 'jpeg', 'gif', 'tiff', 'tif', 'bmp',
                     'webp', 'cur', 'ico', 'pcx', 'psd', 'psb']
diff --git a/bigml/deepnet.py b/bigml/deepnet.py
@@ -68,14 +68,14 @@
     import tensorflow as tf
     tf.autograph.set_verbosity(0)
     LAMINAR_VERSION = False
-except ModuleNotFoundError:
+except Exception:
     LAMINAR_VERSION = True
 
 try:
     from sensenet.models.wrappers import create_model
     from bigml.images.utils import to_relative_coordinates
     from bigml.constants import IOU_REMOTE_SETTINGS
-except ModuleNotFoundError:
+except Exception:
     LAMINAR_VERSION = True
 
 LOGGER = logging.getLogger('BigML')
diff --git a/bigml/fields.py b/bigml/fields.py
@@ -45,14 +45,21 @@
 import json
 import csv
 import random
+import numpy as np
+
+try:
+    from pandas import DataFrame
+    PANDAS_READY = True
+except ImportError:
+    PANDAS_READY = False
 
 
 from bigml.util import invert_dictionary, python_map_type, find_locale
 from bigml.util import DEFAULT_LOCALE
 from bigml.api_handlers.resourcehandler import get_resource_type, get_fields
 from bigml.constants import (
     SOURCE_PATH, DATASET_PATH, SUPERVISED_PATHS, FUSION_PATH,
-    RESOURCES_WITH_FIELDS, DEFAULT_MISSING_TOKENS, REGIONS)
+    RESOURCES_WITH_FIELDS, DEFAULT_MISSING_TOKENS, REGIONS, CATEGORICAL)
 from bigml.io import UnicodeReader, UnicodeWriter
 
 LIST_LIMIT = 10
@@ -193,6 +200,32 @@ def get_new_fields(output_fields):
     return new_fields
 
 
+def one_hot_code(value, field, decode=False):
+    """Translates a categorical value into its code: the index of the value
+    in the categories list of the field summary. With decode=True, a code
+    is translated back to its category. Raises KeyError for a missing
+    categories list or an unknown code, ValueError for an unknown category."""
+    try:
+        categories = [cat[0] for cat in field["summary"]["categories"]]
+    except KeyError:
+        raise KeyError("Failed to find the categories list. Check the field"
+                       " information.")
+    if not decode:
+        try:
+            return categories.index(value)
+        except ValueError:
+            raise ValueError("The '%s' value is not found in the categories "
+                             "list: %s" % (value, categories))
+    try:
+        code = int(value)  # codes may arrive as floats (e.g. numpy rows)
+        if code < 0:  # would silently index from the end of the list
+            raise IndexError(code)
+        return categories[code]  # lists raise IndexError, never KeyError
+    except (IndexError, TypeError, ValueError, OverflowError):
+        raise KeyError("Code not found in the categories list. %s" %
+                       categories)
+
+
 class Fields():
     """A class to deal with BigML auto-generated ids.
 
@@ -483,6 +516,77 @@ def stats(self, field_name):
         summary = self.fields[field_id].get('summary', {})
         return summary
 
+    def objective_field_info(self):
+        """Objective field as {field ID: field info}, or None if unset"""
+        if self.objective_field is not None:
+            field_id = self.field_id(self.objective_field)
+            return {field_id: self.fields[field_id]}
+        return None
+
+    def sorted_field_ids(self, objective=False):
+        """List of field IDs ordered by column number. If objective is
+        set to False, the objective field will be excluded.
+        """
+        columns = self.fields_by_column_number
+        # None matches no column, so nothing is excluded in that case
+        excluded = None if objective else self.objective_field
+        # sort explicitly: a dict keeps insertion order, not key order,
+        # and return a real list instead of a dict view
+        return [columns[col] for col in sorted(columns) if col != excluded]
+
+    def to_numpy(self, input_data_list, objective=False):
+        """Transforming input data to numpy syntax. Fields are sorted in the
+        dataset order, categorical fields become category indexes and None
+        becomes NaN. If objective is False, the objective field is excluded"""
+        if PANDAS_READY and isinstance(input_data_list, DataFrame):
+            inner_data_list = input_data_list.to_dict('records')
+        else:
+            inner_data_list = input_data_list
+        field_ids = self.sorted_field_ids(objective=objective)
+        np_input_list = np.empty((len(inner_data_list), len(field_ids)))
+        for index, input_data in enumerate(inner_data_list):
+            row = []  # plain list: np.append would copy the array per value
+            for field_id in field_ids:
+                value = input_data.get(field_id,
+                    input_data.get(self.field_name(field_id)))
+                if value is None:
+                    value = np.nan  # missing; from_numpy maps NaN to None
+                elif self.fields[field_id]["optype"] == CATEGORICAL:
+                    value = one_hot_code(value, self.fields[field_id])
+                row.append(value)
+            np_input_list[index] = row
+        return np_input_list
+
+    def from_numpy(self, np_data_list, objective=False, by_name=True):
+        """Transforming input data from numpy syntax. Fields are expected
+        in the dataset order. NaN values are translated to None and the
+        codes of categorical fields are decoded to their categories."""
+        input_data_list = []
+        field_ids = self.sorted_field_ids(objective=objective)
+        for np_data in np_data_list:
+            if len(np_data) != len(field_ids):
+                raise ValueError("Wrong number of features in data: %s"
+                " found, %s expected" % (len(np_data), len(field_ids)))
+            input_data = {}
+            for field_id, code in zip(field_ids, np_data):
+                field = self.fields[field_id]
+                field_input = None if np.isnan(code) else code
+                # a missing value has no code to decode
+                if field_input is not None and field["optype"] == CATEGORICAL:
+                    field_input = one_hot_code(field_input, field, decode=True)
+                key = field["name"] if by_name else field_id
+                input_data[key] = field_input
+            input_data_list.append(input_data)
+        return input_data_list
+
+    def one_hot_codes(self, field_name):
+        """Maps every category of a categorical field to its numeric code"""
+        field = self.fields[self.field_id(field_name)]
+        if field["optype"] != CATEGORICAL:
+            raise ValueError("Only categorical fields are encoded")
+        return {category[0]: code for code, category
+                in enumerate(field["summary"]["categories"])}
+
     def summary_csv(self, filename=None):
         """Summary of the contents of the fields
 
diff --git a/bigml/modelfields.py b/bigml/modelfields.py
@@ -241,7 +241,7 @@ def add_terms(self, categories=False, numerics=False):
                     self.fields[field_id]["summary"]["categories"]:
                 self.categories[field_id] = [category for \
                     [category, _] in field['summary']['categories']]
-                del self.fields[field_id]["summary"]["categories"]
+                # del self.fields[field_id]["summary"]["categories"]
             if field['optype'] == 'datetime' and \
                     hasattr(self, "coeff_ids"):
                 self.coeff_id = [coeff_id for coeff_id in self.coeff_ids \
diff --git a/bigml/shapwrapper.py b/bigml/shapwrapper.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+# pylint: disable=super-init-not-called
+#
+# Copyright 2023 BigML
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""A wrapper for models to produce predictions as expected by Shap Explainer
+
+"""
+import numpy as np
+
+from bigml.supervised import SupervisedModel, extract_id
+from bigml.fields import Fields
+from bigml.api import get_resource_type, get_api_connection
+
+
+class ShapWrapper():
+    """ A lightweight wrapper around any supervised model that offers a
+    predict method adapted to the expected Shap Explainer syntax"""
+
+    def __init__(self, model, api=None, cache_get=None,
+                 operation_settings=None):
+        """Builds the local model and the fields info used to translate data"""
+        self.api = get_api_connection(api)
+        _, model = extract_id(model, self.api)
+        # reuse the connection resolved above instead of the raw api argument
+        self.local_model = SupervisedModel(model, api=self.api,
+            cache_get=cache_get, operation_settings=operation_settings)
+        objective_id = getattr(self.local_model, "objective_id", None)
+        self.fields = Fields(self.local_model.fields,
+                             objective_field=objective_id)
+        self.x_headers = [self.fields.field_name(field_id) for field_id in
+                          self.fields.sorted_field_ids()]
+        self.y_header = self.fields.field_name(self.fields.objective_field)
+
+    def predict(self, x_test, **kwargs):
+        """Prediction method that interfaces with the Shap library"""
+        input_data_list = self.fields.from_numpy(x_test)
+        batch_prediction = self.local_model.batch_predict(
+            input_data_list, outputs={"output_fields": ["prediction"],
+                                      "output_headers": [self.y_header]},
+            all_fields=False, **kwargs)
+        objective_field = self.fields.objective_field_info()
+        pred_fields = Fields(objective_field)
+        return pred_fields.to_numpy(batch_prediction,
+                                    objective=True).reshape(-1)
diff --git a/bigml/supervised.py b/bigml/supervised.py
@@ -170,7 +170,8 @@ def data_transformations(self):
         """
         return self.local_model.data_transformations()
 
-    def batch_predict(self, input_data_list, outputs=None, **kwargs):
+    def batch_predict(self, input_data_list, outputs=None, all_fields=True,
+                      **kwargs):
         """Creates a batch prediction for a list of inputs using the local
         supervised model. Allows to define some output settings to
         decide the fields to be added to the input_data (prediction,
@@ -185,6 +186,8 @@ def batch_predict(self, input_data_list, outputs=None, **kwargs):
         :type input_data_list: list or Panda's dataframe
         :param dict outputs: properties that define the headers and fields to
                              be added to the input data
+        :param boolean all_fields: whether all the fields in the input data
+                                   should be part of the response
         :return: the list of input data plus the predicted values
         :rtype: list or Panda's dataframe depending on the input type in
                 input_data_list
@@ -199,17 +202,22 @@ def batch_predict(self, input_data_list, outputs=None, **kwargs):
             new_headers = new_headers[0: len(new_fields)]
         data_format = get_data_format(input_data_list)
         inner_data_list = get_formatted_data(input_data_list, INTERNAL)
+        predictions_list = []
+        kwargs.update({"full": True})
         for input_data in inner_data_list:
-            kwargs.update({"full": True})
             prediction = self.predict(input_data, **kwargs)
+            prediction_data = {}
+            if all_fields:
+                prediction_data.update(input_data)
             for index, key in enumerate(new_fields):
                 try:
-                    input_data[new_headers[index]] = prediction[key]
+                    prediction_data[new_headers[index]] = prediction[key]
                 except KeyError:
                     pass
+            predictions_list.append(prediction_data)
         if data_format != INTERNAL:
-            return format_data(inner_data_list, out_format=data_format)
-        return inner_data_list
+            return format_data(predictions_list, out_format=data_format)
+        return predictions_list
 
     #pylint: disable=locally-disabled,arguments-differ
     def dump(self, **kwargs):
diff --git a/bigml/util.py b/bigml/util.py
@@ -723,6 +723,7 @@ def get_data_format(input_data_list):
     raise ValueError("Data is expected to be provided as a list of "
                      "dictionaries or Pandas' DataFrame.")
 
+
 #pylint: disable=locally-disabled,comparison-with-itself
 def format_data(input_data_list, out_format=None):
     """Transforms the input data format to the one expected """