3 changes: 2 additions & 1 deletion requirements.txt
@@ -6,4 +6,5 @@ ttach
 tqdm
 opencv-python
 matplotlib
-scikit-learn
+scikit-learn
+transformers
168 changes: 168 additions & 0 deletions usage_examples/clip_example
@@ -0,0 +1,168 @@
import argparse

import cv2
import numpy as np
import torch
from torch import nn
from transformers import CLIPProcessor, CLIPModel


from pytorch_grad_cam import GradCAM, \
ScoreCAM, \
GradCAMPlusPlus, \
AblationCAM, \
XGradCAM, \
EigenCAM, \
EigenGradCAM, \
LayerCAM, \
FullGrad

from pytorch_grad_cam.utils.image import show_cam_on_image, \
preprocess_image
from pytorch_grad_cam.ablation_layer import AblationLayerVit


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--use-cuda', action='store_true', default=False,
help='Use NVIDIA GPU acceleration')
parser.add_argument(
'--image-path',
type=str,
default='./examples/both.png',
help='Input image path')
parser.add_argument(
'--labels',
type=str,
nargs='+',
default=["a cat", "a dog", "a car", "a person", "a shoe"],
        help='Text labels to score the image against'
)

parser.add_argument('--aug_smooth', action='store_true',
help='Apply test time augmentation to smooth the CAM')
parser.add_argument(
'--eigen_smooth',
action='store_true',
        help='Reduce noise by taking the first principal component '
        'of cam_weights*activations')

parser.add_argument(
'--method',
type=str,
default='gradcam',
        help='Can be gradcam/gradcam++/scorecam/xgradcam/ablationcam/'
        'eigencam/eigengradcam/layercam/fullgrad')

args = parser.parse_args()
args.use_cuda = args.use_cuda and torch.cuda.is_available()
if args.use_cuda:
print('Using GPU for acceleration')
else:
print('Using CPU for computation')

return args


def reshape_transform(tensor, height=16, width=16):
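    # CLIP ViT-L/14 at 224x224 input yields a 16x16 grid of patch tokens plus
    # one class token; drop the class token and reshape the remaining tokens
    # into a (batch, height, width, channels) map.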
result = tensor[:, 1:, :].reshape(tensor.size(0),
height, width, tensor.size(2))

# Bring the channels to the first dimension,
# like in CNNs.
result = result.transpose(2, 3).transpose(1, 2)
return result


class ImageClassifier(nn.Module):
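    """Wraps CLIP so that forward(pixel_values) returns per-label softmax
    probabilities, letting the CAM methods treat it like an ordinary
    image classifier."""
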
def __init__(self, labels):
super(ImageClassifier, self).__init__()
self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
self.labels = labels

    def forward(self, x):
        # Tokenize the candidate labels; the processor returns CPU tensors,
        # so move them to the same device as the image batch.
        text_inputs = self.processor(text=self.labels, return_tensors="pt", padding=True)
        text_inputs = {k: v.to(x.device) for k, v in text_inputs.items()}

        outputs = self.clip(pixel_values=x, input_ids=text_inputs['input_ids'], attention_mask=text_inputs['attention_mask'])

logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

for label, prob in zip(self.labels, probs[0]):
print(f"{label}: {prob:.4f}")
return probs


if __name__ == '__main__':
""" python vit_gradcam.py --image-path <path_to_image>
Example usage of using cam-methods on a VIT network.

"""

args = get_args()
methods = \
{"gradcam": GradCAM,
"scorecam": ScoreCAM,
"gradcam++": GradCAMPlusPlus,
"ablationcam": AblationCAM,
"xgradcam": XGradCAM,
"eigencam": EigenCAM,
"eigengradcam": EigenGradCAM,
"layercam": LayerCAM,
"fullgrad": FullGrad}

if args.method not in list(methods.keys()):
raise Exception(f"method should be one of {list(methods.keys())}")

labels = args.labels
model = ImageClassifier(labels)
if args.use_cuda:
model.cuda()
model.eval()
print(model)

    # The first LayerNorm of the last vision encoder block serves as the
    # target layer; reshape_transform converts its (batch, tokens, channels)
    # activations into 2D spatial maps.
    target_layers = [model.clip.vision_model.encoder.layers[-1].layer_norm1]

rgb_img = cv2.imread(args.image_path, 1)[:, :, ::-1]
rgb_img = cv2.resize(rgb_img, (224, 224))
rgb_img = np.float32(rgb_img) / 255
input_tensor = preprocess_image(rgb_img, mean=[0.5, 0.5, 0.5],
std=[0.5, 0.5, 0.5])
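    # Note: CLIP's own preprocessing uses mean (0.4815, 0.4578, 0.4082) and
    # std (0.2686, 0.2613, 0.2758); the 0.5/0.5 values here are an
    # approximation, so probabilities may differ slightly from
    # CLIPProcessor's output.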

if args.method == "ablationcam":
cam = methods[args.method](model=model,
target_layers=target_layers,
reshape_transform=reshape_transform,
ablation_layer=AblationLayerVit())
else:
cam = methods[args.method](model=model,
target_layers=target_layers,
reshape_transform=reshape_transform)



# If None, returns the map for the highest scoring category.
# Otherwise, targets the requested category.
targets = None
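    # For example, to always explain the first label in args.labels:
    # from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
    # targets = [ClassifierOutputTarget(0)]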
print(input_tensor.shape)

# AblationCAM and ScoreCAM have batched implementations.
# You can override the internal batch size for faster computation.
cam.batch_size = 32

grayscale_cam = cam(input_tensor=input_tensor,
targets=targets,
eigen_smooth=args.eigen_smooth,
aug_smooth=args.aug_smooth)

# Here grayscale_cam has only one image in the batch
grayscale_cam = grayscale_cam[0, :]

cam_image = show_cam_on_image(rgb_img, grayscale_cam)
cv2.imwrite(f'{args.method}_cam.jpg', cam_image)