diff --git a/requirements.txt b/requirements.txt index 1451e880..ae14dcc1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,5 @@ ttach tqdm opencv-python matplotlib -scikit-learn \ No newline at end of file +scikit-learn +transformers \ No newline at end of file diff --git a/usage_examples/clip_example b/usage_examples/clip_example new file mode 100644 index 00000000..9339d171 --- /dev/null +++ b/usage_examples/clip_example @@ -0,0 +1,168 @@ +import argparse + +import cv2 +import numpy as np +import torch +from torch import nn +from transformers import CLIPProcessor, CLIPModel + + +from pytorch_grad_cam import GradCAM, \ + ScoreCAM, \ + GradCAMPlusPlus, \ + AblationCAM, \ + XGradCAM, \ + EigenCAM, \ + EigenGradCAM, \ + LayerCAM, \ + FullGrad + +from pytorch_grad_cam.utils.image import show_cam_on_image, \ + preprocess_image +from pytorch_grad_cam.ablation_layer import AblationLayerVit + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--use-cuda', action='store_true', default=False, + help='Use NVIDIA GPU acceleration') + parser.add_argument( + '--image-path', + type=str, + default='./examples/both.png', + help='Input image path') + parser.add_argument( + '--labels', + type=str, + nargs='+', + default=["a cat", "a dog", "a car", "a person", "a shoe"], + help='need recognition labels' + ) + + parser.add_argument('--aug_smooth', action='store_true', + help='Apply test time augmentation to smooth the CAM') + parser.add_argument( + '--eigen_smooth', + action='store_true', + help='Reduce noise by taking the first principle componenet' + 'of cam_weights*activations') + + parser.add_argument( + '--method', + type=str, + default='gradcam', + help='Can be gradcam/gradcam++/scorecam/xgradcam/ablationcam') + + args = parser.parse_args() + args.use_cuda = args.use_cuda and torch.cuda.is_available() + if args.use_cuda: + print('Using GPU for acceleration') + else: + print('Using CPU for computation') + + return args + + +def reshape_transform(tensor, height=16, width=16): + result = tensor[:, 1:, :].reshape(tensor.size(0), + height, width, tensor.size(2)) + + # Bring the channels to the first dimension, + # like in CNNs. + result = result.transpose(2, 3).transpose(1, 2) + return result + + +class ImageClassifier(nn.Module): + def __init__(self, labels): + super(ImageClassifier, self).__init__() + self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14") + self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") + self.labels = labels + + def forward(self, x): + text_inputs = self.processor(text=labels, return_tensors="pt", padding=True) + + outputs = self.clip(pixel_values=x, input_ids=text_inputs['input_ids'], attention_mask=text_inputs['attention_mask']) + + logits_per_image = outputs.logits_per_image + probs = logits_per_image.softmax(dim=1) + + for label, prob in zip(self.labels, probs[0]): + print(f"{label}: {prob:.4f}") + return probs + + +if __name__ == '__main__': + """ python vit_gradcam.py --image-path + Example usage of using cam-methods on a VIT network. + + """ + + args = get_args() + methods = \ + {"gradcam": GradCAM, + "scorecam": ScoreCAM, + "gradcam++": GradCAMPlusPlus, + "ablationcam": AblationCAM, + "xgradcam": XGradCAM, + "eigencam": EigenCAM, + "eigengradcam": EigenGradCAM, + "layercam": LayerCAM, + "fullgrad": FullGrad} + + if args.method not in list(methods.keys()): + raise Exception(f"method should be one of {list(methods.keys())}") + + labels = args.labels + model = ImageClassifier(labels) + if args.use_cuda: + model.cuda() + model.eval() + print(model) + + target_layers = [model.clip.vision_model.encoder.layers[-1].layer_norm1] + + if args.method not in methods: + raise Exception(f"Method {args.method} not implemented") + + if args.use_cuda: + model = model.cuda() + + rgb_img = cv2.imread(args.image_path, 1)[:, :, ::-1] + rgb_img = cv2.resize(rgb_img, (224, 224)) + rgb_img = np.float32(rgb_img) / 255 + input_tensor = preprocess_image(rgb_img, mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]) + + if args.method == "ablationcam": + cam = methods[args.method](model=model, + target_layers=target_layers, + reshape_transform=reshape_transform, + ablation_layer=AblationLayerVit()) + else: + cam = methods[args.method](model=model, + target_layers=target_layers, + reshape_transform=reshape_transform) + + + + # If None, returns the map for the highest scoring category. + # Otherwise, targets the requested category. + targets = None + print(input_tensor.shape) + + # AblationCAM and ScoreCAM have batched implementations. + # You can override the internal batch size for faster computation. + cam.batch_size = 32 + + grayscale_cam = cam(input_tensor=input_tensor, + targets=targets, + eigen_smooth=args.eigen_smooth, + aug_smooth=args.aug_smooth) + + # Here grayscale_cam has only one image in the batch + grayscale_cam = grayscale_cam[0, :] + + cam_image = show_cam_on_image(rgb_img, grayscale_cam) + cv2.imwrite(f'{args.method}_cam.jpg', cam_image)