Skip to content

Commit 7b2a367

Browse files
committed
transformer interpreters update
1 parent 411b322 commit 7b2a367

File tree

3 files changed

+30
-16
lines changed

3 files changed

+30
-16
lines changed

interpretdl/interpreter/abc_interpreter.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,15 @@ def __init__(self, paddle_model: callable, device: str, use_cuda: bool = None, *
385385
"paddle_model has to be " \
386386
"an instance of paddle.nn.Layer or a compatible one."
387387

388-
def _build_predict_fn(self, rebuild: bool = False, embedding_name: str or None = None, attn_map_name: str or None = None,
389-
attn_v_name: str or None = None, attn_proj_name: str or None = None, nlp: bool = False):
388+
def _build_predict_fn(
389+
self,
390+
rebuild: bool = False,
391+
embedding_name: str or None = None,
392+
attn_map_name: str or None = None,
393+
attn_v_name: str or None = None,
394+
attn_proj_name: str or None = None,
395+
gradient_of: str or None = None,
396+
nlp: bool = False):
390397

391398
"""Build ``predict_fn`` for transformer based algorithms.
392399
The model is supposed to be a classification model.
@@ -463,7 +470,7 @@ def block_value_hook(layer, input, output):
463470
if attn_map_name is not None and re.match(attn_map_name, n):
464471
h = v.register_forward_post_hook(block_attn_hook)
465472
hooks.append(h)
466-
elif scale is not None and re.match(embedding_name, n):
473+
elif scale is not None and embedding_name is not None and re.match(embedding_name, n):
467474
h = v.register_forward_post_hook(hook)
468475
hooks.append(h)
469476
elif attn_proj_name is not None and re.match(attn_proj_name, n):
@@ -474,21 +481,28 @@ def block_value_hook(layer, input, output):
474481
h = v.register_forward_post_hook(block_value_hook)
475482
hooks.append(h)
476483

477-
out = self.paddle_model(*inputs)
484+
logits = self.paddle_model(*inputs)
478485

479486
for h in hooks:
480487
h.remove()
481488

482-
proba = paddle.nn.functional.softmax(out, axis=1)
489+
proba = paddle.nn.functional.softmax(logits, axis=1)
483490
preds = paddle.argmax(proba, axis=1)
484491
if label is None:
485492
label = preds.numpy()
493+
label_onehot = paddle.nn.functional.one_hot(paddle.to_tensor(label), num_classes=logits.shape[1])
486494

487495
block_attns_grads = []
488-
489-
label_onehot = paddle.nn.functional.one_hot(paddle.to_tensor(label), num_classes=proba.shape[1])
490-
target = paddle.sum(proba * label_onehot, axis=1)
491-
target.backward()
496+
497+
if gradient_of == 'probability' or gradient_of is None:
498+
target = paddle.sum(proba * label_onehot, axis=1)
499+
target.backward()
500+
elif gradient_of == 'logit':
501+
target = paddle.sum(logits * label_onehot, axis=1)
502+
target.backward()
503+
else:
504+
raise ValueError("`gradient_of` should be one of ['logit', 'probability'].")
505+
492506
for i, attn in enumerate(block_attns):
493507
grad = attn.grad.numpy()
494508
block_attns_grads.append(grad)

interpretdl/interpreter/bidirectional_transformer.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def interpret(self,
167167
ap_mode: str = "head",
168168
start_layer: int = 11,
169169
steps: int = 20,
170-
embedding_name='^[a-z]*.embeddings.word_embeddings$',
170+
embedding_name='^[a-z]*.embeddings$',
171171
attn_map_name='^[a-z]*.encoder.layers.[0-9]*.self_attn.attn_drop$',
172172
attn_v_name='^[a-z]*.encoder.layers.[0-9]*.self_attn.v_proj$',
173173
attn_proj_name='^[a-z]*.encoder.layers.[0-9]*.self_attn.out_proj$',
@@ -182,7 +182,7 @@ def interpret(self,
182182
start_layer (int, optional): Compute the state from the start layer. Default: ``11``.
183183
steps (int, optional): number of steps in the Riemann approximation of the integral. Default: ``20``.
184184
embedding_name (str, optional): The layer name for embedding, head-wise/token-wise.
185-
Default: ``^ernie.embeddings.word_embeddings$``.
185+
Default: ``^ernie.embeddings$``.
186186
attn_map_name (str, optional): The layer name to obtain the attention weights, head-wise/token-wise.
187187
Default: ``^ernie.encoder.layers.*.self_attn.attn_drop$``.
188188
attn_v_name (str, optional): The layer name for value projection, token-wise.
@@ -216,7 +216,8 @@ def text_to_input_fn(raw_text):
216216
model_input = tuple(model_input, )
217217

218218
self._build_predict_fn(embedding_name=embedding_name, attn_map_name=attn_map_name,
219-
attn_v_name=attn_v_name, attn_proj_name=attn_proj_name, nlp=True)
219+
attn_v_name=attn_v_name, attn_proj_name=attn_proj_name,
220+
gradient_of='logit')
220221

221222
attns, grads, inputs, values, projs, proba, preds = self.predict_fn(model_input)
222223
assert start_layer < len(attns), "start_layer should be in the range of [0, num_block-1]"
@@ -269,6 +270,8 @@ def text_to_input_fn(raw_text):
269270
# intermediate results, for possible further usages.
270271
self.predicted_label = preds
271272
self.predicted_proba = proba
273+
self.ap = R[:, 0, :]
274+
self.rf = grad_head_mean[:, 0, :]
272275

273276
if visual:
274277
# TODO: visualize if tokenizer is given.

interpretdl/interpreter/generic_attention.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,6 @@ def interpret(self,
234234
text_to_input_fn: callable = None,
235235
label: int or None = None,
236236
start_layer: int = 11,
237-
embedding_name='^[a-z]*.embeddings.word_embeddings$',
238237
attn_map_name='^[a-z]*.encoder.layers.[0-9]*.self_attn.attn_drop$',
239238
max_seq_len=128,
240239
visual=False):
@@ -246,8 +245,6 @@ def interpret(self,
246245
label (list or tuple or numpy.ndarray, optional): The target labels to analyze. The number of labels
247246
should be equal to the number of texts. If None, the most likely label for each text will be used.
248247
Default: ``None``.
249-
embedding_name (str, optional): The layer name for word embedding.
250-
Default: ``^ernie.embeddings.word_embeddings$``.
251248
attn_map_name (str, optional): The layer name to obtain attention weights.
252249
Default: ``^ernie.encoder.layers.*.self_attn.attn_drop$``
253250
@@ -272,7 +269,7 @@ def text_to_input_fn(raw_text):
272269
model_input = tuple(inp for inp in model_input)
273270
else:
274271
model_input = tuple(model_input, )
275-
self._build_predict_fn(embedding_name=embedding_name, attn_map_name=attn_map_name, nlp=True)
272+
self._build_predict_fn(attn_map_name=attn_map_name, gradient_of='logit', nlp=True)
276273

277274
attns, grads, inputs, values, projs, proba, preds = self.predict_fn(model_input)
278275
assert start_layer < len(attns), "start_layer should be in the range of [0, num_block-1]"

0 commit comments

Comments
 (0)