diff --git a/automation/notebooks-table-data.csv b/automation/notebooks-table-data.csv
index 506af50..bc819e9 100644
--- a/automation/notebooks-table-data.csv
+++ b/automation/notebooks-table-data.csv
@@ -13,4 +13,5 @@ MLP Mixer,architectures/mlp-mixer.ipynb,,https://arxiv.org/abs/2105.01601
 GloVe Word Embeddings, data_exploration/glove-word-embeddings.ipynb,https://github.com/stanfordnlp/GloVe,https://nlp.stanford.edu/pubs/glove.pdf
 Vision Transformer (ViT),architectures/vit.ipynb,,https://arxiv.org/pdf/2010.11929
 Multi-Head Attention, modules/multihead-self-attention.ipynb,,https://arxiv.org/abs/1706.03762
-ResNet,architectures/resnet.ipynb,,https://arxiv.org/abs/1512.03385
\ No newline at end of file
+ResNet,architectures/resnet.ipynb,,https://arxiv.org/abs/1512.03385
+DINO,architectures/dino.ipynb,,https://arxiv.org/abs/2104.14294
diff --git a/notebooks/architectures/dino.ipynb b/notebooks/architectures/dino.ipynb
new file mode 100644
index 0000000..0aaca3f
--- /dev/null
+++ b/notebooks/architectures/dino.ipynb
@@ -0,0 +1,440 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        ""
+      ],
+      "metadata": {
+        "id": "Iy-fAH0K-iJC"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "# **DINO: Emerging Properties in Self-Supervised Vision Transformers**"
+      ],
+      "metadata": {
+        "id": "hmK_SbYa_fSA"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "The paper “Emerging Properties in Self-Supervised Vision Transformers” introduces DINO, a self-distillation approach to self-supervised learning with Vision Transformers (ViTs). In simple terms, the goal of the paper is to demonstrate that a model can learn useful representations of images without any labels, by training a student network to match the output of a momentum teacher."
+      ],
+      "metadata": {
+        "id": "25swm6eH-srf"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "**Why is DINO relevant?**\n",
+        "\n",
+        " - Self-supervision: The DINO method avoids reliance on large amounts of labeled data, which is useful in scenarios where labeling is costly or impractical.\n",
+        "\n",
+        " - Vision Transformers: It uses ViTs, a powerful architecture for computer vision tasks, and shows that these networks can be trained effectively without supervision.\n",
+        "\n",
+        " - Emergent Properties: A model trained with DINO learns to capture high-level spatial structure and relationships in images. Surprisingly, it produces highly interpretable attention maps and accurate object localizations without being explicitly trained to do so.\n"
+      ],
+      "metadata": {
+        "id": "cM7jkpbs_buN"
+      }
+    },
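+    {
+      "cell_type": "markdown",
+      "source": [
+        "The training recipe can be summarized in a few equations (notation follows the paper; the code below implements a simplified, single-crop version of it). A student network $g_{\\theta_s}$ is trained to match a teacher network $g_{\\theta_t}$ on two augmented views $x_1, x_2$ of the same image, with both outputs mapped to $K$-dimensional probability distributions:\n",
+        "\n",
+        "$$P_s(x) = \\mathrm{softmax}\\big(g_{\\theta_s}(x)/\\tau_s\\big), \\qquad P_t(x) = \\mathrm{softmax}\\big((g_{\\theta_t}(x) - C)/\\tau_t\\big)$$\n",
+        "\n",
+        "The student minimizes the cross-entropy $-\\sum_k P_t(x_1)_k \\log P_s(x_2)_k$ (plus the symmetric term). The teacher receives no gradients: its weights are an exponential moving average (EMA) of the student's, $\\theta_t \\leftarrow \\lambda\\,\\theta_t + (1-\\lambda)\\,\\theta_s$, and the center $C$ is an EMA of the teacher's outputs. Centering combined with sharpening (a teacher temperature $\\tau_t < \\tau_s$) is what prevents the representations from collapsing."
+      ],
+      "metadata": {
+        "id": "dinoRecipeMd01"
+      }
+    },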
+    {
+      "cell_type": "markdown",
+      "source": [
+        ""
+      ],
+      "metadata": {
+        "id": "1g-cR112_8Ff"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "WJB9P2mT94Jy"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "#@title **Install required packages**\n",
+        "\n",
+        "!pip install torchinfo"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "#@title **Importing libraries**\n",
+        "\n",
+        "import torchinfo\n",
+        "import torch\n",
+        "import torch.nn as nn\n",
+        "import torch.nn.functional as F\n",
+        "import torchvision.transforms as transforms\n",
+        "from torchvision.models import vit_b_16\n",
+        "from sklearn.neighbors import KNeighborsClassifier\n",
+        "from sklearn.linear_model import LogisticRegression\n",
+        "from sklearn.preprocessing import StandardScaler\n",
+        "import numpy as np"
+      ],
+      "metadata": {
+        "id": "GWMebzLuAAuS"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Note: not every dependency exposes a __version__ attribute.\n",
+        "\n",
+        "print(torch.__version__)"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9rCTVIL4ACpD",
+        "outputId": "630e8f90-47ec-4e70-8173-aa394cd98d24"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "2.5.1+cu121\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "### DINO architecture code (ViT-Base/16 backbone)"
+      ],
+      "metadata": {
+        "id": "48zU4_8sAJRF"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "class DINOHead(nn.Module):\n",
+        "    \"\"\"\n",
+        "    DINO projection head for self-supervised learning\n",
+        "    \"\"\"\n",
+        "    def __init__(self, in_dim, out_dim, use_bn=True, norm_last_layer=True, nlayers=3, hidden_dim=2048):\n",
+        "        super().__init__()\n",
+        "        nlayers = max(nlayers, 1)\n",
+        "        if nlayers == 1:\n",
+        "            self.mlp = nn.Linear(in_dim, out_dim)\n",
+        "        else:\n",
+        "            layers = [nn.Linear(in_dim, hidden_dim)]\n",
+        "            if use_bn:\n",
+        "                layers.append(nn.BatchNorm1d(hidden_dim))\n",
+        "            layers.append(nn.GELU())\n",
+        "\n",
+        "            for _ in range(nlayers - 2):\n",
+        "                layers.append(nn.Linear(hidden_dim, hidden_dim))\n",
+        "                if use_bn:\n",
+        "                    layers.append(nn.BatchNorm1d(hidden_dim))\n",
+        "                layers.append(nn.GELU())\n",
+        "\n",
+        "            layers.append(nn.Linear(hidden_dim, out_dim))\n",
+        "            if norm_last_layer:\n",
+        "                layers.append(nn.BatchNorm1d(out_dim, affine=False))\n",
+        "\n",
+        "            # Keep the Sequential inside the else branch so the\n",
+        "            # single-layer case above is not overwritten\n",
+        "            self.mlp = nn.Sequential(*layers)\n",
+        "\n",
+        "    def forward(self, x):\n",
+        "        return self.mlp(x)"
+      ],
+      "metadata": {
+        "id": "g5VuCgFuAKpV"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
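+    {
+      "cell_type": "markdown",
+      "source": [
+        "A quick shape check of the projection head. The dimensions here are small and purely illustrative; in the model below the head maps 768-dimensional ViT features to a 65536-dimensional output:"
+      ],
+      "metadata": {
+        "id": "headCheckMd01"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Sanity check with small, illustrative dimensions\n",
+        "head = DINOHead(in_dim=768, out_dim=256)\n",
+        "fake_features = torch.randn(4, 768)  # a fake batch of backbone features\n",
+        "print(head(fake_features).shape)     # expected: torch.Size([4, 256])"
+      ],
+      "metadata": {
+        "id": "headCheckCode01"
+      },
+      "execution_count": null,
+      "outputs": []
+    },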
+    {
+      "cell_type": "code",
+      "source": [
+        "class DINO(nn.Module):\n",
+        "    \"\"\"\n",
+        "    DINO model with a ViT-Base/16 backbone.\n",
+        "    (The paper's default is ViT-Small/16, which torchvision does not ship,\n",
+        "    so we use vit_b_16 and its 768-dimensional features instead.)\n",
+        "    \"\"\"\n",
+        "    def __init__(self,\n",
+        "                 out_dim=65536,\n",
+        "                 use_bn_in_head=False,\n",
+        "                 norm_last_layer=True,\n",
+        "                 momentum=0.999,\n",
+        "                 temperature_student=0.1,\n",
+        "                 temperature_teacher=0.04,\n",
+        "                 center_momentum=0.9):\n",
+        "        super().__init__()\n",
+        "\n",
+        "        # Load a pre-trained ViT-Base/16 backbone\n",
+        "        self.backbone = vit_b_16(weights=\"IMAGENET1K_V1\")\n",
+        "\n",
+        "        # Remove the classification head\n",
+        "        self.backbone.heads = nn.Identity()\n",
+        "\n",
+        "        # Feature dimension of ViT-Base/16\n",
+        "        feature_dim = 768\n",
+        "\n",
+        "        # Create student and teacher heads\n",
+        "        self.student_head = DINOHead(\n",
+        "            feature_dim,\n",
+        "            out_dim,\n",
+        "            use_bn=use_bn_in_head,\n",
+        "            norm_last_layer=norm_last_layer\n",
+        "        )\n",
+        "\n",
+        "        self.teacher_head = DINOHead(\n",
+        "            feature_dim,\n",
+        "            out_dim,\n",
+        "            use_bn=use_bn_in_head,\n",
+        "            norm_last_layer=False\n",
+        "        )\n",
+        "\n",
+        "        # Freeze teacher head parameters (they are updated only via EMA)\n",
+        "        for param in self.teacher_head.parameters():\n",
+        "            param.requires_grad = False\n",
+        "\n",
+        "        # Initialize the teacher head as a copy of the student head\n",
+        "        with torch.no_grad():\n",
+        "            for student_params, teacher_params in zip(self.student_head.parameters(), self.teacher_head.parameters()):\n",
+        "                teacher_params.data.copy_(student_params.data)\n",
+        "\n",
+        "        # Momentum parameter for the EMA update of the teacher\n",
+        "        self.momentum = momentum\n",
+        "\n",
+        "        # Center (C), kept as a buffer since the optimizer never trains it\n",
+        "        self.register_buffer(\"center\", torch.zeros(out_dim))\n",
+        "\n",
+        "        # Temperatures (the teacher is sharpened with a lower temperature,\n",
+        "        # e.g. 0.04 as in the paper, versus 0.1 for the student)\n",
+        "        self.temperature_student = temperature_student\n",
+        "        self.temperature_teacher = temperature_teacher\n",
+        "\n",
+        "        # Center momentum (for the EMA update of the center)\n",
+        "        self.center_momentum = center_momentum\n",
+        "\n",
+        "    def update_teacher(self):\n",
+        "        \"\"\"\n",
+        "        Update the teacher head with an EMA (Exponential Moving Average) of the student head\n",
+        "        \"\"\"\n",
+        "        with torch.no_grad():\n",
+        "            for student_params, teacher_params in zip(self.student_head.parameters(), self.teacher_head.parameters()):\n",
+        "                teacher_params.data = self.momentum * teacher_params.data + (1. - self.momentum) * student_params.data\n",
+        "\n",
+        "    def forward(self, x1, x2):\n",
+        "        \"\"\"\n",
+        "        Forward pass with two augmented views of the same image\n",
+        "        \"\"\"\n",
+        "        # Extract features from both augmented views\n",
+        "        z1 = self.backbone(x1)\n",
+        "        z2 = self.backbone(x2)\n",
+        "\n",
+        "        # Project features through the student head\n",
+        "        p1 = self.student_head(z1)\n",
+        "        p2 = self.student_head(z2)\n",
+        "\n",
+        "        # Teacher projections on the same (detached) features, so no\n",
+        "        # gradients flow into the teacher. A full implementation would\n",
+        "        # also keep a separate EMA copy of the backbone for the teacher.\n",
+        "        with torch.no_grad():\n",
+        "            t1 = self.teacher_head(z1.detach())\n",
+        "            t2 = self.teacher_head(z2.detach())\n",
+        "\n",
+        "        return p1, p2, t1, t2\n",
+        "\n",
+        "    def update_center(self, t1, t2):\n",
+        "        \"\"\"\n",
+        "        Update the center of the teacher outputs using an EMA (Exponential Moving Average)\n",
+        "        \"\"\"\n",
+        "        with torch.no_grad():\n",
+        "            # Concatenate teacher outputs and compute the batch mean\n",
+        "            center_update = torch.cat([t1, t2]).mean(dim=0)\n",
+        "            self.center = self.center_momentum * self.center + (1. - self.center_momentum) * center_update"
+      ],
+      "metadata": {
+        "id": "dinoModelCell01"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
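+    {
+      "cell_type": "markdown",
+      "source": [
+        "A small smoke test of the model with an illustrative `out_dim` (the first run downloads the pretrained ViT-B/16 weights). The student projections should require gradients while the teacher projections should not:"
+      ],
+      "metadata": {
+        "id": "dinoSmokeMd01"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Run two fake \"views\" through the model and inspect the outputs\n",
+        "demo_model = DINO(out_dim=1024)\n",
+        "views = torch.randn(2, 3, 224, 224)\n",
+        "p1, p2, t1, t2 = demo_model(views, views)\n",
+        "print(p1.shape, t1.shape)                  # expected: torch.Size([2, 1024]) twice\n",
+        "print(p1.requires_grad, t1.requires_grad)  # expected: True False"
+      ],
+      "metadata": {
+        "id": "dinoSmokeCode01"
+      },
+      "execution_count": null,
+      "outputs": []
+    },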
+    {
+      "cell_type": "code",
+      "source": [
+        "def dino_loss(student_output, teacher_output, center, student_temp=0.1, teacher_temp=0.04):\n",
+        "    \"\"\"\n",
+        "    DINO loss: cross-entropy between the centered, sharpened teacher\n",
+        "    distribution and the student distribution\n",
+        "    \"\"\"\n",
+        "    # Student: log-softmax at the (higher) student temperature\n",
+        "    student_log_probs = F.log_softmax(student_output / student_temp, dim=-1)\n",
+        "\n",
+        "    # Teacher: centering (subtract C) and sharpening (low temperature)\n",
+        "    # together prevent collapse\n",
+        "    teacher_probs = F.softmax((teacher_output - center) / teacher_temp, dim=-1).detach()\n",
+        "\n",
+        "    # Cross-entropy, averaged over the batch\n",
+        "    loss = torch.sum(-teacher_probs * student_log_probs, dim=-1).mean()\n",
+        "    return loss\n",
+        "\n",
+        "\n",
+        "def prepare_dino_transforms():\n",
+        "    \"\"\"\n",
+        "    Data augmentations for DINO (a single-crop simplification of the\n",
+        "    paper's multi-crop strategy; apply it twice to obtain two views)\n",
+        "    \"\"\"\n",
+        "    train_transform = transforms.Compose([\n",
+        "        transforms.RandomResizedCrop(224),\n",
+        "        transforms.RandomHorizontalFlip(),\n",
+        "        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2),\n",
+        "        transforms.RandomGrayscale(p=0.2),\n",
+        "        transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),\n",
+        "        transforms.ToTensor(),\n",
+        "        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n",
+        "    ])\n",
+        "    return train_transform\n",
+        "\n",
+        "\n",
+        "def train_dino(model, optimizer, train_loader, device):\n",
+        "    \"\"\"\n",
+        "    Training loop for DINO. The DataLoader is expected to yield two\n",
+        "    random augmentations of each image, i.e. batches of ((view1, view2), labels).\n",
+        "    \"\"\"\n",
+        "    model.train()\n",
+        "    for (view1, view2), _ in train_loader:\n",
+        "        x1, x2 = view1.to(device), view2.to(device)\n",
+        "\n",
+        "        # Forward pass\n",
+        "        p1, p2, t1, t2 = model(x1, x2)\n",
+        "\n",
+        "        # Symmetric loss: student on one view vs. teacher on the other\n",
+        "        loss1 = dino_loss(p1, t2, model.center, model.temperature_student, model.temperature_teacher)\n",
+        "        loss2 = dino_loss(p2, t1, model.center, model.temperature_student, model.temperature_teacher)\n",
+        "        loss = (loss1 + loss2) / 2\n",
+        "\n",
+        "        # Backpropagate (only the student receives gradients)\n",
+        "        optimizer.zero_grad()\n",
+        "        loss.backward()\n",
+        "        optimizer.step()\n",
+        "\n",
+        "        # EMA updates of the teacher head and of the center\n",
+        "        model.update_teacher()\n",
+        "        model.update_center(t1, t2)\n",
+        "\n",
+        "        # Optional: print the loss for monitoring\n",
+        "        print(f\"Loss: {loss.item()}\")"
+      ],
+      "metadata": {
+        "id": "dinoTrainCell01"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "def extract_features(model, dataloader, device):\n",
+        "    \"\"\"\n",
+        "    Extract features using the DINO backbone\n",
+        "    \"\"\"\n",
+        "    model.eval()\n",
+        "    all_features = []\n",
+        "    all_labels = []\n",
+        "\n",
+        "    with torch.no_grad():\n",
+        "        for images, labels in dataloader:\n",
+        "            images = images.to(device)\n",
+        "            features = model.backbone(images)\n",
+        "            all_features.append(features.cpu().numpy())\n",
+        "            all_labels.append(labels.numpy())\n",
+        "\n",
+        "    return np.concatenate(all_features), np.concatenate(all_labels)\n",
+        "\n",
+        "\n",
+        "def knn_evaluation(train_features, train_labels, test_features, test_labels, k=5):\n",
+        "    \"\"\"\n",
+        "    K-Nearest Neighbors evaluation\n",
+        "    \"\"\"\n",
+        "    scaler = StandardScaler()\n",
+        "    train_features_scaled = scaler.fit_transform(train_features)\n",
+        "    test_features_scaled = scaler.transform(test_features)\n",
+        "\n",
+        "    knn = KNeighborsClassifier(n_neighbors=k)\n",
+        "    knn.fit(train_features_scaled, train_labels)\n",
+        "\n",
+        "    predictions = knn.predict(test_features_scaled)\n",
+        "    accuracy = np.mean(predictions == test_labels)\n",
+        "\n",
+        "    return accuracy\n",
+        "\n",
+        "\n",
+        "def linear_classifier_evaluation(train_features, train_labels, test_features, test_labels):\n",
+        "    \"\"\"\n",
+        "    Linear classifier (logistic regression) evaluation\n",
+        "    \"\"\"\n",
+        "    scaler = StandardScaler()\n",
+        "    train_features_scaled = scaler.fit_transform(train_features)\n",
+        "    test_features_scaled = scaler.transform(test_features)\n",
+        "\n",
+        "    linear_clf = LogisticRegression(max_iter=1000)\n",
+        "    linear_clf.fit(train_features_scaled, train_labels)\n",
+        "\n",
+        "    predictions = linear_clf.predict(test_features_scaled)\n",
+        "    accuracy = np.mean(predictions == test_labels)\n",
+        "\n",
+        "    return accuracy\n",
+        "\n",
+        "\n",
+        "def evaluate_representations(model, train_loader, test_loader, device, knn_k=5):\n",
+        "    \"\"\"\n",
+        "    Comprehensive evaluation of the learned representations\n",
+        "    \"\"\"\n",
+        "    # Extract features\n",
+        "    train_features, train_labels = extract_features(model, train_loader, device)\n",
+        "    test_features, test_labels = extract_features(model, test_loader, device)\n",
+        "\n",
+        "    # KNN evaluation\n",
+        "    knn_accuracy = knn_evaluation(train_features, train_labels,\n",
+        "                                  test_features, test_labels, k=knn_k)\n",
+        "\n",
+        "    # Linear classifier evaluation\n",
+        "    linear_accuracy = linear_classifier_evaluation(train_features, train_labels,\n",
+        "                                                   test_features, test_labels)\n",
+        "\n",
+        "    return {\n",
+        "        'knn_accuracy': knn_accuracy,\n",
+        "        'linear_classifier_accuracy': linear_accuracy\n",
+        "    }"
+      ],
+      "metadata": {
+        "id": "dinoEvalCell01"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Example usage\n",
+        "if __name__ == '__main__':\n",
+        "    # Setup\n",
+        "    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
+        "    model = DINO().to(device)\n",
+        "    # Only the student parameters are trainable\n",
+        "    optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=1e-4)\n",
+        "\n",
+        "    # Note: this is a skeleton. A real run would require a dataset whose\n",
+        "    # DataLoader yields two augmented views per image (see train_dino).\n",
+        "    print(\"DINO model with ViT-Base/16 backbone initialized!\")"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "bGo7Etl0Zn8T",
+        "outputId": "ab9e220a-a8bd-45f7-d8f0-4bf0f5753558"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "DINO model with ViT-Base/16 backbone initialized!\n"
+          ]
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file