Skip to content

Commit aba3b50

Browse files
Add tooling for feature importance (#109)
* Laid groundwork for feature importance stuff * Got stuff initially working * Cleaned up comments/functionality * Worked on some stuff * Probably finished up feature importance tool for grabbing shap values * Fixed CI (probably) * Added shap to development dependencies * Moved requirements to a more appropriate place * Got plots initially working * Minor changes, fix CI * Added functionality to collapse along LRs in the regalloc case * Added typing info to main feature_importance script * Added typing annotations * Added unit tests for feature_importance script * Fix pytest absl flag errors * Added in inline documentation * Added documentation on graphing shap data in an IPython notebook * Added notes on model path * Added missing import * Removed inline inner functions and added documentation to functions I missed * Refactored utilities to feature_importance_utils.py Co-authored-by: Yundi Qian <[email protected]>
1 parent fc26cfc commit aba3b50

9 files changed

+734
-3
lines changed
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# coding=utf-8
2+
# Copyright 2020 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""A tool for analyzing which features a model uses to make a decision.
16+
17+
This script allows for processing a set of examples generated from a trace
18+
created through generate_default_trace into a set of shap values which
19+
represent how much that specific feature contributes to the final output of
20+
the model. These values can then be imported into an IPython notebook and
21+
graphed with the help of the feature_importance_graphs.py module in the same
22+
folder.
23+
24+
Usage:
25+
PYTHONPATH=$PYTHONPATH:. python3 compiler_opt/tools/feature_importance.py \
26+
--gin_files=compiler_opt/rl/regalloc/gin_configs/common.gin \
27+
--gin_bindings=config_registry.get_configuration.implementation=\
28+
@configs.RegallocEvictionConfig \
29+
--data_path=/default_trace \
30+
--model_path=/warmstart/saved_policy \
31+
--num_examples=5 \
32+
--output_file=./explanation_data.json
33+
34+
The type of trace that is performed (i.e. whether it is just tracing the
default heuristic or tracing an ML model) doesn't matter, as the only data
that matters is the input features. The num_examples flag sets the number of
37+
examples that get processed into shap values. Increasing this value will
38+
potentially allow you to reach better conclusions depending upon how you're
39+
viewing the data, but increasing it will also increase the runtime of this
40+
script quite significantly as the process is not multithreaded.
41+
"""
42+
43+
from absl import app
44+
from absl import flags
45+
from absl import logging
46+
import gin
47+
48+
from compiler_opt.rl import data_reader
49+
from compiler_opt.rl import constant
50+
from compiler_opt.rl import registry
51+
52+
from compiler_opt.tools import feature_importance_utils
53+
54+
import tensorflow as tf
55+
import shap
56+
import numpy
57+
import numpy.typing
58+
import json
59+
60+
# Command-line flags: where to read the trace, which model to explain, and
# where/how much SHAP output to produce. Gin flags configure the problem
# (e.g. regalloc) so the signature specs can be resolved.
_DATA_PATH = flags.DEFINE_multi_string(
    'data_path', [], 'Path to TFRecord file(s) containing trace data.')
_MODEL_PATH = flags.DEFINE_string('model_path', '',
                                  'Path to the model to explain')
_OUTPUT_FILE = flags.DEFINE_string(
    'output_file', '', 'The path to the output file containing the SHAP values')
_NUM_EXAMPLES = flags.DEFINE_integer(
    'num_examples', 1, 'The number of examples to process from the trace')
_GIN_FILES = flags.DEFINE_multi_string(
    'gin_files', [], 'List of paths to gin configuration files.')
_GIN_BINDINGS = flags.DEFINE_multi_string(
    'gin_bindings', [],
    'Gin bindings to override the values set in the config files.')
73+
74+
75+
def main(_):
  """Computes SHAP values for a model over trace examples and saves them.

  Reads `_NUM_EXAMPLES` examples from the TFRecord trace at `_DATA_PATH`,
  runs the saved policy at `_MODEL_PATH` through a shap KernelExplainer, and
  writes the resulting SHAP values (plus the raw data and feature names) as
  JSON to `_OUTPUT_FILE`.
  """
  gin.parse_config_files_and_bindings(
      _GIN_FILES.value, bindings=_GIN_BINDINGS.value, skip_unknown=False)
  logging.info(gin.config_str())

  problem_config = registry.get_configuration()
  time_step_spec, action_spec = problem_config.get_signature_spec()

  # Batch size and sequence length of 1 so each dataset element is a single
  # example we can flatten into one row of the SHAP input matrix.
  tfrecord_dataset_fn = data_reader.create_tfrecord_dataset_fn(
      agent_name=constant.AgentName.BEHAVIORAL_CLONE,
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      batch_size=1,
      train_sequence_length=1)

  dataset_iter = iter(tfrecord_dataset_fn(_DATA_PATH.value).repeat())

  # The first example is consumed only to derive the input signature and the
  # flattened feature-vector size.
  raw_trajectory = next(dataset_iter)

  saved_policy = tf.saved_model.load(_MODEL_PATH.value)
  action_fn = saved_policy.signatures['action']

  observation = feature_importance_utils.process_raw_trajectory(raw_trajectory)
  input_sig = feature_importance_utils.get_input_signature(observation)

  run_model = feature_importance_utils.create_run_model_function(
      action_fn, input_sig)

  total_size = feature_importance_utils.get_signature_total_size(input_sig)
  # NOTE: the original code also flattened and expand_dims-ed this first
  # observation into a `flattened_input` that was never used; that dead code
  # has been removed.
  dataset = numpy.empty((_NUM_EXAMPLES.value, total_size))
  for i in range(_NUM_EXAMPLES.value):
    raw_trajectory = next(dataset_iter)
    observation = feature_importance_utils.process_raw_trajectory(
        raw_trajectory)
    dataset[i] = feature_importance_utils.flatten_input(
        observation, total_size)

  # All-zeros background sample: SHAP values measure each feature's
  # contribution relative to this baseline.
  explainer = shap.KernelExplainer(run_model, numpy.zeros((1, total_size)))
  shap_values = explainer.shap_values(dataset, nsamples=1000)
  processed_shap_values = feature_importance_utils.collapse_values(
      input_sig, shap_values, _NUM_EXAMPLES.value)

  # if we have more than one value per feature, just set the dataset to zeros
  # as summing across a dimension produces data that doesn't really mean
  # anything
  if feature_importance_utils.get_max_part_size(input_sig) > 1:
    dataset = numpy.zeros(processed_shap_values.shape)

  feature_names = list(input_sig.keys())

  output_file_data = {
      'expected_values': explainer.expected_value,
      'shap_values': processed_shap_values.tolist(),
      'data': dataset.tolist(),
      'feature_names': feature_names
  }

  with open(_OUTPUT_FILE.value, 'w', encoding='utf-8') as output_file:
    json.dump(output_file_data, output_file)


if __name__ == '__main__':
  app.run(main)
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
# coding=utf-8
2+
# Copyright 2020 Google LLC
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""A module that allows for easily graphing feature importance data in
16+
notebooks"""
17+
18+
import numpy
19+
import numpy.typing
20+
import shap
21+
import json
22+
23+
from typing import Dict, List, Union, Optional
24+
25+
# Shape of the dictionary produced by load_shap_values: numpy arrays for the
# numeric entries plus a plain list of feature-name strings.
DataType = Dict[str, Union[numpy.typing.ArrayLike, List[str]]]


def load_shap_values(file_name: str) -> DataType:
  """Loads a set of shap values created with the feature_importance.py script
  into a dictionary that can then be used for creating graphs.

  Args:
    file_name: The name of the file in which the shap values are stored. What
      the --output_file flag was set to in the feature importance script.

  Returns:
    A dictionary with 'expected_values', 'shap_values' and 'data' as numpy
    arrays and 'feature_names' as a list of strings.
  """
  with open(file_name, encoding='utf-8') as file_to_load:
    data = json.load(file_to_load)
  # A scalar expected value (single model output) gets wrapped in a list so
  # indexing works uniformly downstream. The original check used
  # `data['expected_values'] is not list`, which compares the value against
  # the `list` type object itself and is therefore always true — it wrapped
  # list inputs a second time. isinstance is the correct check.
  if not isinstance(data['expected_values'], list):
    data['expected_values'] = [data['expected_values']]
  return {
      'expected_values': numpy.asarray(data['expected_values']),
      'shap_values': numpy.asarray(data['shap_values']),
      'data': numpy.asarray(data['data']),
      'feature_names': data['feature_names']
  }
46+
47+
48+
def init_shap_for_notebook():
  """Loads the javascript shap needs to render interactive plots.

  Call this once in a notebook before drawing force plots.
  """
  shap.initjs()
51+
52+
53+
def graph_individual_example(data: DataType, index: Optional[int]):
  """Renders a force plot for a single example from the dataset.

  Args:
    data: A dictionary of shap values, raw feature data, and feature names as
      produced by load_shap_values.
    index: The index of the example that you wish to plot.
  """
  shap_row = data['shap_values'][index, :]
  feature_row = data['data'][index, :]
  return shap.force_plot(
      data['expected_values'],
      shap_row,
      feature_row,
      feature_names=data['feature_names'])
66+
67+
68+
def graph_summary_plot(data: DataType):
  """Draws a summary plot over every example in the dataset.

  Args:
    data: A dictionary of shap values, raw feature data, and feature names as
      produced by load_shap_values.
  """
  shap_values = data['shap_values']
  feature_data = data['data']
  return shap.summary_plot(
      shap_values, feature_data, feature_names=data['feature_names'])

0 commit comments

Comments
 (0)