jacobgil · sayakpaul · Apr 3, 2021 · Apr 3, 2021
diff --git a/vit_explain.ipynb b/vit_explain.ipynb
diff --git a/vit_grad_rollout.py b/vit_grad_rollout.py
@@ -6,6 +6,8 @@
 import numpy as np
 import cv2
 
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
 def grad_rollout(attentions, gradients, discard_ratio):
     result = torch.eye(attentions[0].size(-1))
     with torch.no_grad():
@@ -57,10 +59,11 @@ def get_attention_gradient(self, module, grad_input, grad_output):
     def __call__(self, input_tensor, category_index):
         self.model.zero_grad()
         output = self.model(input_tensor)
-        category_mask = torch.zeros(output.size())
+        # Build the mask on the same device/dtype as the model output.
+        category_mask = torch.zeros_like(output)
         category_mask[:, category_index] = 1
         loss = (output*category_mask).sum()
         loss.backward()
 
         return grad_rollout(self.attentions, self.attention_gradients,
-            self.discard_ratio)
+            self.discard_ratio)