diff --git a/SEM-EVAL_with_SHAP.ipynb b/SEM-EVAL_with_SHAP.ipynb new file mode 100644 index 0000000..f418eaa --- /dev/null +++ b/SEM-EVAL_with_SHAP.ipynb @@ -0,0 +1,44836 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "mpl9UcM6E06W" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import torch\n", + "from torch.utils.data import Dataset\n", + "from torch.utils.data import DataLoader\n", + "from transformers import RobertaModel\n", + "from transformers import RobertaTokenizer\n", + "from transformers import AdamW\n", + "from sklearn.metrics import f1_score" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DEDQl1g16gqn", + "outputId": "bcc530e7-f30d-40c5-f4db-e1a08e91e71a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dynamic max length: 107\n" + ] + } + ], + "source": [ + "# Load the CSV file\n", + "def load_data(file_path):\n", + " df = pd.read_csv(file_path)\n", + " texts = df[\"text\"].tolist()\n", + " labels = df[[\"anger\", \"fear\", \"joy\", \"sadness\", \"surprise\"]].values\n", + " return texts, labels\n", + "\n", + "# Load train and test data\n", + "train_file = \"/content/sample_data/eng_Train.csv\"\n", + "test_file = \"/content/sample_data/eng_dev.csv\"\n", + "human_pred = \"/content/sample_data/eng_eng_test_50 labels.csv\"\n", + "\n", + "\n", + "train_texts, train_labels = load_data(train_file)\n", + "test_texts, test_labels = load_data(test_file)\n", + "h_texts, h_labels = load_data(human_pred)\n", + "\n", + "# Initialize tokenizer\n", + "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-large\")\n", + "\n", + "# Find the max token length dynamically\n", + "def find_max_length(texts, tokenizer):\n", + " tokenized_texts = [tokenizer.tokenize(text) for text in texts]\n", + " return max(len(tokens) for tokens in tokenized_texts)\n", + "\n", + "max_length = find_max_length(train_texts + test_texts, tokenizer) # Find max length from both train & test\n", + "\n", + "print(f\"Dynamic max length: {max_length}\")\n", + "\n", + "# Tokenize with dynamic max_length\n", + "def tokenize_texts(texts, tokenizer, max_length):\n", + " return tokenizer(\n", + " texts,\n", + " max_length=max_length,\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " return_tensors=\"pt\",\n", + " )\n", + "\n", + "train_encodings = tokenize_texts(train_texts, tokenizer, max_length)\n", + "test_encodings = tokenize_texts(test_texts, tokenizer, max_length)\n", + "h_encodings = tokenize_texts(h_texts, tokenizer, max_length)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "id": "xNhCTMag6pqi" + }, + "outputs": [], + "source": [ + "class EmotionDataset(Dataset):\n", + " def __init__(self, encodings, labels):\n", + " self.encodings = encodings\n", + " self.labels = labels\n", + "\n", + " def __len__(self):\n", + " return len(self.labels)\n", + "\n", + " def __getitem__(self, idx):\n", + " return {\n", + " \"input_ids\": self.encodings[\"input_ids\"][idx],\n", + " \"attention_mask\": self.encodings[\"attention_mask\"][idx],\n", + " \"labels\": torch.tensor(self.labels[idx], dtype=torch.float),\n", + " }\n", + "\n", + "# Create datasets\n", + "train_dataset = EmotionDataset(train_encodings, train_labels)\n", + 
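"# Note: test_* below comes from the dev split (eng_dev.csv); h_* wraps the separate human-labelled file loaded above\n", + 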
"test_dataset = EmotionDataset(test_encodings, test_labels)\n", + "h_dataset = EmotionDataset(h_encodings, h_labels)\n", + "class RobertaClass(torch.nn.Module):\n", + " def __init__(self, num_labels=5):\n", + " super(RobertaClass, self).__init__()\n", + " self.roberta = RobertaModel.from_pretrained(\"roberta-large\")\n", + " self.dropout = torch.nn.Dropout(0.5)\n", + "\n", + " # Additional fully connected layers\n", + " self.fc1 = torch.nn.Linear(1024, 512)\n", + " self.fc2 = torch.nn.Linear(512, num_labels)\n", + " #self.classifier = torch.nn.Linear(1024, num_labels) # 1024 is the output dimension of Roberta\n", + "\n", + " def forward(self, input_ids, attention_mask):\n", + " # Get the output from Roberta\n", + " output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)\n", + "\n", + " # We only care about the [CLS] token output for classification (i.e., first token in the sequence)\n", + " cls_output = output[0][:, 0, :] # (batch_size, hidden_size) - [CLS] token representation\n", + "\n", + " # Apply dropout for regularization\n", + " cls_output = self.dropout(cls_output)\n", + "\n", + " # Pass through the classifier to get the logits (raw output scores for each class)\n", + " #logits = self.classifier(cls_output)\n", + "\n", + " # Pass through the first fully connected layer\n", + " x = torch.nn.ReLU()(self.fc1(cls_output))\n", + "\n", + " # Apply dropout after the fully connected layer\n", + " x = self.dropout(x)\n", + "\n", + " # Final output layer for classification\n", + " logits = self.fc2(x)\n", + "\n", + " return logits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5qWOrWIzHCWL" + }, + "source": [ + "Below lighter weight implementation of the Roberta Model. it uses the robot base for faster implementation, as I would like to test for SHAP and Running a full model on Collab is too time consuming." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "-X_FIh_WHPgR" + }, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import RobertaModel\n", + "\n", + "class LightweightRobertaClass(torch.nn.Module):\n", + " def __init__(self, num_labels=5):\n", + " super(LightweightRobertaClass, self).__init__()\n", + " self.roberta = RobertaModel.from_pretrained(\"roberta-base\") # Switch to base model\n", + " self.dropout = torch.nn.Dropout(0.2) # Reduce dropout\n", + " self.classifier = torch.nn.Linear(768, num_labels) # Directly classify from [CLS] token\n", + "\n", + " def forward(self, input_ids, attention_mask):\n", + " output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)\n", + " cls_output = output.last_hidden_state[:, 0, :] # [CLS] token representation\n", + " cls_output = self.dropout(cls_output)\n", + " logits = self.classifier(cls_output)\n", + " return logits\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 463 + }, + "id": "IM90eI047EhM", + "outputId": "9638d68e-05a8-4aae-f7de-81181da8c969" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "ename": "OutOfMemoryError", + "evalue": "CUDA out of memory. Tried to allocate 20.00 MiB. 
GPU 0 has a total capacity of 14.74 GiB of which 20.12 MiB is free. Process 4938 has 14.72 GiB memory in use. Of the allocated memory 14.17 GiB is allocated by PyTorch, and 428.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0mtotal_loss\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/optim/optimizer.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 485\u001b[0m )\n\u001b[1;32m 486\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_optimizer_step_code\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py\u001b[0m in \u001b[0;36mdecorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 114\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mdecorate_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mctx_factory\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 116\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 117\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdecorate_context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/transformers/optimization.py\u001b[0m in \u001b[0;36mstep\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0mexp_avg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbeta1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malpha\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1.0\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mbeta1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0mexp_avg_sq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbeta2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maddcmul_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgrad\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1.0\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mbeta2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m \u001b[0mdenom\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mexp_avg_sq\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msqrt\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"eps\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 649\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m \u001b[0mstep_size\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgroup\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"lr\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 20.12 MiB is free. Process 4938 has 14.72 GiB memory in use. Of the allocated memory 14.17 GiB is allocated by PyTorch, and 428.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" + ] + } + ], + "source": [ + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "# Example model initialization\n", + "model = RobertaClass(num_labels=5)\n", + "model.to(device)\n", + "\n", + "optimizer = AdamW(model.parameters(), lr=1e-4)\n", + "criterion = torch.nn.BCEWithLogitsLoss()\n", + "epochs = 20\n", + "\n", + "# Create data loaders\n", + "train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)\n", + "test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)\n", + "\n", + "# Example training loop\n", + "for epoch in range(epochs):\n", + " model.train()\n", + " total_loss = 0\n", + " for batch in train_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " # Forward pass\n", + " outputs = model(input_ids, attention_mask)\n", + "\n", + " # Compute the loss\n", + " loss = criterion(outputs, labels)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " total_loss += loss.item()\n", + "\n", + " print(f\"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 411 + }, + "id": "5A4wHxoZ7H9J", + "outputId": "e2cf4b43-8c97-46fa-96ea-2dc32f31334e" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;31m# Compute loss\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 42\u001b[0;31m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 43\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/_tensor.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 579\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 580\u001b[0m )\n\u001b[0;32m--> 
581\u001b[0;31m torch.autograd.backward(\n\u001b[0m\u001b[1;32m 582\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgradient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretain_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcreate_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 583\u001b[0m )\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/autograd/__init__.py\u001b[0m in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;31m# some Python versions print out the first line of a multi-line function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0;31m# calls in the traceback and some print out the last line\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 347\u001b[0;31m _engine_run_backward(\n\u001b[0m\u001b[1;32m 348\u001b[0m \u001b[0mtensors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0mgrad_tensors_\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/autograd/graph.py\u001b[0m in \u001b[0;36m_engine_run_backward\u001b[0;34m(t_outputs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 823\u001b[0m \u001b[0munregister_hooks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_register_logging_hooks_on_whole_graph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt_outputs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 824\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 825\u001b[0;31m return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n\u001b[0m\u001b[1;32m 826\u001b[0m \u001b[0mt_outputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 827\u001b[0m ) # Calls into the C++ engine to run the backward pass\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "#Training Model\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "# Model, optimizer, and criterion initialization\n", + "model = RobertaClass(num_labels=5)\n", + "model.to(device)\n", + "epochs = 20\n", + "\n", + "# Create data loaders\n", + "train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)\n", + "test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)\n", + "\n", + "optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8, weight_decay=1e-5) # Adjusted learning rate (1e-5 decay)\n", + "criterion = torch.nn.BCEWithLogitsLoss()\n", + "\n", + "# Scheduler for learning rate decay\n", + "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1) #step_size =2\n", + "\n", + "# Data loaders (batch size increased)\n", + "train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)\n", + "test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)\n", + "\n", + "# Early stopping variables\n", + "best_loss = float('inf')\n", + "epochs = 20\n", + "\n", + "for epoch in range(epochs):\n", + " model.train()\n", + " total_loss = 
0\n", + " for batch in train_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " # Forward pass\n", + " outputs = model(input_ids, attention_mask)\n", + "\n", + " # Compute loss\n", + " loss = criterion(outputs, labels)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " total_loss += loss.item()\n", + "\n", + " scheduler.step() # Update the learning rate based on the scheduler\n", + "\n", + " # Print training loss\n", + " print(f\"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}\")\n", + "\n", + " # Save the best model based on loss\n", + " if total_loss < best_loss:\n", + " best_loss = total_loss\n", + " torch.save(model.state_dict(), \"best_model.pt\")\n", + "\n", + "# Optionally load the best model for evaluation\n", + "model.load_state_dict(torch.load(\"best_model.pt\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pn3Y39wGIFZc" + }, + "source": [ + "Another lighter version of the code for speed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 463 + }, + "id": "TNoWet7dILHe", + "outputId": "4d840bd3-4f7b-4f79-d6dc-4a026359933e" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "ename": "OutOfMemoryError", + "evalue": "CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 20.12 MiB is free. Process 4938 has 14.72 GiB memory in use. Of the allocated memory 14.17 GiB is allocated by PyTorch, and 428.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# Use optimized model\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLightweightRobertaClass\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnum_labels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mepochs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m3\u001b[0m \u001b[0;31m# Reduced epochs for quick SHAP testing\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36mto\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1338\u001b[0m \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1339\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1340\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mconvert\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1341\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1342\u001b[0m def register_full_backward_pre_hook(\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 898\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrecurse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 900\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 902\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 898\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrecurse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[0;32mfor\u001b[0m 
\u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 900\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 902\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 898\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrecurse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mmodule\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchildren\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 900\u001b[0;31m \u001b[0mmodule\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 902\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtensor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensor_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_apply\u001b[0;34m(self, fn, recurse)\u001b[0m\n\u001b[1;32m 925\u001b[0m \u001b[0;31m# `with torch.no_grad():`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 926\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mno_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 927\u001b[0;31m \u001b[0mparam_applied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 928\u001b[0m \u001b[0mp_should_use_set_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute_should_use_set_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_applied\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 929\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36mconvert\u001b[0;34m(t)\u001b[0m\n\u001b[1;32m 1324\u001b[0m \u001b[0mmemory_format\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mconvert_to_format\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1325\u001b[0m )\n\u001b[0;32m-> 1326\u001b[0;31m return t.to(\n\u001b[0m\u001b[1;32m 
1327\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1328\u001b[0m \u001b[0mdtype\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_floating_point\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_complex\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 20.12 MiB is free. Process 4938 has 14.72 GiB memory in use. Of the allocated memory 14.17 GiB is allocated by PyTorch, and 428.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" + ] + } + ], + "source": [ + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "# Use optimized model\n", + "model = LightweightRobertaClass(num_labels=5)\n", + "model.to(device)\n", + "\n", + "epochs = 3 # Reduced epochs for quick SHAP testing\n", + "batch_size = 16 # Larger batch size\n", + "\n", + "# Create DataLoaders\n", + "train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", + "test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)\n", + "\n", + "optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)\n", + "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)\n", + "criterion = torch.nn.BCEWithLogitsLoss()\n", + "scaler = torch.cuda.amp.GradScaler() # Enable mixed precision\n", + "\n", + "best_loss = float('inf')\n", + "\n", + "for epoch in range(epochs):\n", + " model.train()\n", + " total_loss = 0\n", + " for batch in train_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " with torch.cuda.amp.autocast(): # Mixed precision\n", + " outputs = model(input_ids, attention_mask)\n", + " loss = criterion(outputs, labels)\n", + "\n", + " scaler.scale(loss).backward()\n", + " scaler.step(optimizer)\n", + " scaler.update()\n", + "\n", + " total_loss += loss.item()\n", + "\n", + " scheduler.step()\n", + " print(f\"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}\")\n", + "\n", + " if total_loss < best_loss:\n", + " best_loss = total_loss\n", + " torch.save(model.state_dict(), \"best_model.pt\")\n", + "\n", + "# Load best model for SHAP testing\n", + "model.load_state_dict(torch.load(\"best_model.pt\"))\n", + "\n", + "# Run SHAP with no gradient calculations (faster)\n", + "model.eval()\n", + "with torch.no_grad():\n", + " for batch in test_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " outputs = model(input_ids, attention_mask)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 428 + }, + "id": "WPoi4QAj7LKi", + "outputId": "25a3041c-21ff-455f-b6f5-644bf8c79964" + }, + "outputs": [ + { + "name": 
"stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "ename": "RuntimeError", + "evalue": "Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzero_grad\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 52\u001b[0m \u001b[0mloss\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcriterion\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[0mloss\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1736\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1737\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1738\u001b[0m \u001b[0;31m# torchrec tests the code consistency with the following code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
1746\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1748\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask)\u001b[0m\n\u001b[1;32m 31\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 32\u001b[0m \u001b[0;31m# Get the output from Roberta\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 33\u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mroberta\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_mask\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mattention_mask\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 35\u001b[0m \u001b[0;31m# We only care about the [CLS] token output for classification (i.e., first token in the sequence)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1736\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1737\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1738\u001b[0m \u001b[0;31m# torchrec tests the code consistency with the following code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1746\u001b[0m or _global_forward_hooks or 
_global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1748\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/transformers/models/roberta/modeling_roberta.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 910\u001b[0m \u001b[0mtoken_type_ids\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_shape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlong\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 911\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 912\u001b[0;31m embedding_output = self.embeddings(\n\u001b[0m\u001b[1;32m 913\u001b[0m \u001b[0minput_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 914\u001b[0m \u001b[0mposition_ids\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mposition_ids\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1736\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1737\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1738\u001b[0m \u001b[0;31m# torchrec tests the code consistency with the following code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m 
\u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1746\u001b[0m or _global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1748\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/transformers/models/roberta/modeling_roberta.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, token_type_ids, position_ids, inputs_embeds, past_key_values_length)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 121\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minputs_embeds\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 122\u001b[0;31m \u001b[0minputs_embeds\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mword_embeddings\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 123\u001b[0m \u001b[0mtoken_type_embeddings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoken_type_embeddings\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtoken_type_ids\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_compiled_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# type: ignore[misc]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1735\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1736\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_call_impl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1737\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1738\u001b[0m \u001b[0;31m# torchrec tests the code consistency with the following code\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_pre_hooks\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0m_global_backward_hooks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1746\u001b[0m or 
_global_forward_hooks or _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1748\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1749\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/modules/sparse.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mTensor\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 190\u001b[0;31m return F.embedding(\n\u001b[0m\u001b[1;32m 191\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.11/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36membedding\u001b[0;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[1;32m 2549\u001b[0m \u001b[0;31m# remove once script supports set_grad_enabled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2550\u001b[0m \u001b[0m_no_grad_embedding_renorm_\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_norm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnorm_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2551\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membedding\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscale_grad_by_freq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msparse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2552\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2553\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mRuntimeError\u001b[0m: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! 
(when checking argument for argument index in method wrapper_CUDA__index_select)" + ] + } + ], + "source": [ + "# Function to evaluate F1 score on validation set\n", + "from transformers import get_linear_schedule_with_warmup\n", + "\n", + "def evaluate(model, val_loader, criterion):\n", + " model.eval()\n", + " val_loss = 0\n", + " all_preds = []\n", + " all_labels = []\n", + "\n", + " with torch.no_grad():\n", + " for batch in test_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " outputs = model(input_ids, attention_mask)\n", + " loss = criterion(outputs, labels)\n", + " val_loss += loss.item()\n", + "\n", + " preds = (torch.sigmoid(outputs) > 0.5).float()\n", + " all_preds.append(preds.cpu().numpy())\n", + " all_labels.append(labels.cpu().numpy())\n", + "\n", + " val_loss /= len(val_loader)\n", + " all_preds = np.vstack(all_preds)\n", + " all_labels = np.vstack(all_labels)\n", + " f1 = f1_score(all_labels, all_preds, average='micro') # Or use 'macro' for equal weighting\n", + "\n", + " return val_loss, f1\n", + "\n", + "optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)\n", + "\n", + "total_steps = len(train_loader) * epochs\n", + "scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1 * total_steps, num_training_steps=total_steps)\n", + "\n", + "\n", + "# Training loop with validation evaluation\n", + "best_f1 = 0.0\n", + "best_loss = float(\"inf\")\n", + "\n", + "for epoch in range(epochs):\n", + " model.train()\n", + " total_loss = 0\n", + "\n", + " for batch in train_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " optimizer.zero_grad()\n", + " outputs = model(input_ids, attention_mask)\n", + " loss = criterion(outputs, labels)\n", + " loss.backward()\n", + " torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient clipping\n", + " optimizer.step()\n", + "\n", + " total_loss += loss.item()\n", + "\n", + " scheduler.step() # Update learning rate\n", + " avg_train_loss = total_loss / len(train_loader)\n", + "\n", + " # **Evaluate on validation set**\n", + " val_loss, val_f1 = evaluate(model, test_loader, criterion)\n", + "\n", + " print(f\"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}\")\n", + "\n", + " # **Save best model based on F1 score**\n", + " if val_f1 > best_f1:\n", + " best_f1 = val_f1\n", + " torch.save(model.state_dict(), \"best_model_f1.pt\")\n", + "\n", + "print(f\"Best Validation F1 Score: {best_f1:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "q7KXLlll7PYx", + "outputId": "1060dfa9-c532-4aaa-cae6-75ed937f43a3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "F1 Score per label:\n", + "Label 0 F1 Score: 0.7857\n", + "Label 1 F1 Score: 0.7591\n", + "Label 2 F1 Score: 0.6792\n", + "Label 3 F1 Score: 0.6875\n", + "Label 4 F1 Score: 0.6441\n" + ] + } + ], + "source": [ + "# Evaluation\n", + "model.eval()\n", + "all_preds = []\n", + "all_labels = []\n", + "test_loss = 0.0\n", + "correct = 0\n", + "total = 0\n", + "\n", + "with torch.no_grad():\n", + " for batch in test_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = 
batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " # Forward pass\n", + " outputs = model(input_ids, attention_mask)\n", + "\n", + " # Apply sigmoid and threshold to get binary predictions\n", + " #preds = (torch.sigmoid(outputs) > 0.5).float() # Threshold at 0.5 for multi-label\n", + " thresholds = torch.tensor([0.45, 0.5, 0.5, 0.5, 0.5]).to(device) # Lower threshold for Label 0\n", + " preds = (torch.sigmoid(outputs) > thresholds).float()\n", + "\n", + "\n", + " # Collect predictions and labels for F1 score\n", + " all_preds.append(preds.cpu().numpy())\n", + " all_labels.append(labels.cpu().numpy())\n", + "\n", + "# Flatten lists and calculate F1 score\n", + "all_preds = np.vstack(all_preds) # Shape: (num_samples, num_labels)\n", + "all_labels = np.vstack(all_labels) # Shape: (num_samples, num_labels)\n", + "\n", + "# Calculate F1 score for each label individually\n", + "f1_per_label = f1_score(all_labels, all_preds, average=None) # F1 score for each label\n", + "print(\"F1 Score per label:\")\n", + "for idx, score in enumerate(f1_per_label):\n", + " print(f\"Label {idx} F1 Score: {score:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xjHqJqm4E3JY" + }, + "source": [ + "\n", + "# New Section" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "NW38FunsRnMu", + "outputId": "4fb7b676-9eb0-44a1-d56e-15974f60f1d9" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "/usr/local/lib/python3.11/dist-packages/transformers/optimization.py:591: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5, Loss: 0.49888609630220077\n", + "Epoch 2/5, Loss: 0.3397563512710964\n", + "Epoch 3/5, Loss: 0.26328266867819955\n", + "Epoch 4/5, Loss: 0.2056334319360116\n", + "Epoch 5/5, Loss: 0.17322912316988495\n", + "Model training complete. 
Model saved to /content/best_model.pt\n" + ] + } + ], + "source": [ + "import torch\n", + "import numpy as np\n", + "import pandas as pd\n", + "from torch.utils.data import Dataset, DataLoader\n", + "from transformers import RobertaTokenizer, RobertaModel, AdamW, get_scheduler\n", + "from sklearn.metrics import f1_score\n", + "\n", + "# Set device\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "# Load the CSV file\n", + "def load_data(file_path):\n", + " df = pd.read_csv(file_path)\n", + " texts = df[\"text\"].tolist()\n", + " labels = df[[\"anger\", \"fear\", \"joy\", \"sadness\", \"surprise\"]].values\n", + " return texts, labels\n", + "\n", + "# File paths\n", + "train_file = \"/content/sample_data/eng_Train.csv\"\n", + "test_file = \"/content/sample_data/eng_dev.csv\"\n", + "\n", + "train_texts, train_labels = load_data(train_file)\n", + "test_texts, test_labels = load_data(test_file)\n", + "\n", + "# Initialize tokenizer\n", + "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n", + "\n", + "def tokenize_texts(texts, tokenizer, max_length=64):\n", + " return tokenizer(\n", + " texts,\n", + " max_length=max_length,\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " return_tensors=\"pt\",\n", + " )\n", + "\n", + "train_encodings = tokenize_texts(train_texts, tokenizer)\n", + "test_encodings = tokenize_texts(test_texts, tokenizer)\n", + "\n", + "# Dataset Class\n", + "class EmotionDataset(Dataset):\n", + " def __init__(self, encodings, labels):\n", + " self.encodings = encodings\n", + " self.labels = labels\n", + "\n", + " def __len__(self):\n", + " return len(self.labels)\n", + "\n", + " def __getitem__(self, idx):\n", + " return {\n", + " \"input_ids\": self.encodings[\"input_ids\"][idx],\n", + " \"attention_mask\": self.encodings[\"attention_mask\"][idx],\n", + " \"labels\": torch.tensor(self.labels[idx], dtype=torch.float),\n", + " }\n", + "\n", + "train_dataset = EmotionDataset(train_encodings, train_labels)\n", + "test_dataset = EmotionDataset(test_encodings, test_labels)\n", + "\n", + "# Lightweight RoBERTa Model\n", + "class LightweightRobertaClass(torch.nn.Module):\n", + " def __init__(self, num_labels=5):\n", + " super(LightweightRobertaClass, self).__init__()\n", + " self.roberta = RobertaModel.from_pretrained(\"roberta-base\")\n", + " self.dropout = torch.nn.Dropout(0.3)\n", + " self.classifier = torch.nn.Linear(768, num_labels)\n", + "\n", + " def forward(self, input_ids, attention_mask):\n", + " output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)\n", + " cls_output = output.last_hidden_state[:, 0, :]\n", + " cls_output = self.dropout(cls_output)\n", + " logits = self.classifier(cls_output)\n", + " return logits\n", + "\n", + "# Training\n", + "model = LightweightRobertaClass(num_labels=5).to(device)\n", + "optimizer = AdamW(model.parameters(), lr=2e-5)\n", + "criterion = torch.nn.BCEWithLogitsLoss()\n", + "epochs = 5\n", + "batch_size = 16\n", + "\n", + "train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", + "test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)\n", + "\n", + "scheduler = get_scheduler(\"linear\", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)\n", + "\n", + "for epoch in range(epochs):\n", + " model.train()\n", + " total_loss = 0\n", + " for batch in train_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = 
batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " optimizer.zero_grad()\n", + " outputs = model(input_ids, attention_mask)\n", + " loss = criterion(outputs, labels)\n", + " loss.backward()\n", + " optimizer.step()\n", + " scheduler.step()\n", + " total_loss += loss.item()\n", + "\n", + " print(f\"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader)}\")\n", + "\n", + "# Save model\n", + "torch.save(model.state_dict(), \"/content/best_model.pt\")\n", + "print(\"Model training complete. Model saved to /content/best_model.pt\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QWUg_WFMd3dJ", + "outputId": "c4c081ab-caa9-4155-8c3e-de99b358210d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "F1 Score per label:\n", + "Label 0 F1 Score: 0.7333\n", + "Label 1 F1 Score: 0.7463\n", + "Label 2 F1 Score: 0.6545\n", + "Label 3 F1 Score: 0.6984\n", + "Label 4 F1 Score: 0.6557\n" + ] + } + ], + "source": [ + "# Evaluation\n", + "model.eval()\n", + "all_preds = []\n", + "all_labels = []\n", + "test_loss = 0.0\n", + "correct = 0\n", + "total = 0\n", + "\n", + "with torch.no_grad():\n", + " for batch in test_loader:\n", + " input_ids = batch[\"input_ids\"].to(device)\n", + " attention_mask = batch[\"attention_mask\"].to(device)\n", + " labels = batch[\"labels\"].to(device)\n", + "\n", + " # Forward pass\n", + " outputs = model(input_ids, attention_mask)\n", + "\n", + " # Apply sigmoid and threshold to get binary predictions\n", + " #preds = (torch.sigmoid(outputs) > 0.5).float() # Threshold at 0.5 for multi-label\n", + " thresholds = torch.tensor([0.45, 0.5, 0.5, 0.5, 0.5]).to(device) # Lower threshold for Label 0\n", + " preds = (torch.sigmoid(outputs) > thresholds).float()\n", + "\n", + "\n", + " # Collect predictions and labels for F1 score\n", + " all_preds.append(preds.cpu().numpy())\n", + " all_labels.append(labels.cpu().numpy())\n", + "\n", + "# Flatten lists and calculate F1 score\n", + "all_preds = np.vstack(all_preds) # Shape: (num_samples, num_labels)\n", + "all_labels = np.vstack(all_labels) # Shape: (num_samples, num_labels)\n", + "\n", + "# Calculate F1 score for each label individually\n", + "f1_per_label = f1_score(all_labels, all_preds, average=None) # F1 score for each label\n", + "print(\"F1 Score per label:\")\n", + "for idx, score in enumerate(f1_per_label):\n", + " print(f\"Label {idx} F1 Score: {score:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "id": "YHQVFUDJSrb1" + }, + "outputs": [], + "source": [ + "import shap\n", + "import torch\n", + "import numpy as np\n", + "import pandas as pd\n", + "from transformers import RobertaTokenizer\n", + "from torch.utils.data import DataLoader\n", + "\n", + "# Set device\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "# Load the development dataset (this file has labels)\n", + "dev_file = \"/content/sample_data/eng_dev.csv\"\n", + "\n", + "# Load CSV file\n", + "def load_data(file_path):\n", + " df = pd.read_csv(file_path)\n", + " texts = df[\"text\"].tolist()\n", + " labels = df[[\"anger\", \"fear\", \"joy\", \"sadness\", \"surprise\"]].values\n", + " return texts, labels\n", + "\n", + "dev_texts, dev_labels = load_data(dev_file)\n", + "\n", + "# Load tokenizer\n", + "tokenizer = RobertaTokenizer.from_pretrained(\"roberta-base\")\n", + "\n", + "# Tokenize 
function\n", + "def tokenize_texts(texts, tokenizer, max_length=64):\n", + " return tokenizer(\n", + " texts,\n", + " max_length=max_length,\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " return_tensors=\"pt\",\n", + " )\n", + "\n", + "dev_encodings = tokenize_texts(dev_texts, tokenizer)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UPVSLH7KUKV9", + "outputId": "00524632-91b4-4fb4-d0d8-53044e4c9aa0" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + ":20: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(\"/content/best_model.pt\", map_location=device))\n" + ] + }, + { + "data": { + "text/plain": [ + "LightweightRobertaClass(\n", + " (roberta): RobertaModel(\n", + " (embeddings): RobertaEmbeddings(\n", + " (word_embeddings): Embedding(50265, 768, padding_idx=1)\n", + " (position_embeddings): Embedding(514, 768, padding_idx=1)\n", + " (token_type_embeddings): Embedding(1, 768)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (encoder): RobertaEncoder(\n", + " (layer): ModuleList(\n", + " (0-11): 12 x RobertaLayer(\n", + " (attention): RobertaAttention(\n", + " (self): RobertaSdpaSelfAttention(\n", + " (query): Linear(in_features=768, out_features=768, bias=True)\n", + " (key): Linear(in_features=768, out_features=768, bias=True)\n", + " (value): Linear(in_features=768, out_features=768, bias=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (output): RobertaSelfOutput(\n", + " (dense): Linear(in_features=768, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " (intermediate): RobertaIntermediate(\n", + " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", + " (intermediate_act_fn): GELUActivation()\n", + " )\n", + " (output): RobertaOutput(\n", + " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", + " (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (pooler): RobertaPooler(\n", + " (dense): 
Linear(in_features=768, out_features=768, bias=True)\n", + " (activation): Tanh()\n", + " )\n", + " )\n", + " (dropout): Dropout(p=0.3, inplace=False)\n", + " (classifier): Linear(in_features=768, out_features=5, bias=True)\n", + ")" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import RobertaModel\n", + "\n", + "# Define the same model structure used for training\n", + "class LightweightRobertaClass(torch.nn.Module):\n", + " def __init__(self, num_labels=5):\n", + " super(LightweightRobertaClass, self).__init__()\n", + " self.roberta = RobertaModel.from_pretrained(\"roberta-base\")\n", + " self.dropout = torch.nn.Dropout(0.3)\n", + " self.classifier = torch.nn.Linear(768, num_labels)\n", + "\n", + " def forward(self, input_ids, attention_mask):\n", + " output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)\n", + " cls_output = output.last_hidden_state[:, 0, :]\n", + " cls_output = self.dropout(cls_output)\n", + " logits = self.classifier(cls_output)\n", + " return logits\n", + "\n", + "# Load trained model weights saved after training\n", + "model = LightweightRobertaClass(num_labels=5)\n", + "model.load_state_dict(torch.load(\"/content/best_model.pt\", map_location=device))\n", + "model.to(device)\n", + "model.eval() # Set model to evaluation mode\n" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "id": "EzB-7y4gVL14" + }, + "outputs": [], + "source": [ + "def predict(input_texts):\n", + " \"\"\" Tokenizes text and runs it through the model; used as the prediction function for SHAP \"\"\"\n", + " encodings = tokenizer(\n", + " input_texts, padding=True, truncation=True, max_length=64, return_tensors=\"pt\"\n", + " ).to(device)\n", + "\n", + " with torch.no_grad():\n", + " outputs = model(encodings[\"input_ids\"], encodings[\"attention_mask\"])\n", + "\n", + " return torch.sigmoid(outputs).cpu().numpy() # Convert logits to probabilities\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uBpZcPMxmYby" + }, + "source": [ + "### Below is a basic application of SHAP. A better implementation is in the next code segment." + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "t1lx9Y_4VO1_", + "outputId": "9eaec16c-7e40-40ab-ba6d-4c1064ed67fc" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
[0]
\n", + "
\n", + "
\n", + "\n", + "
outputs
\n", + "
Output 0
\n", + "
Output 1
\n", + "
Output 2
\n", + "
Output 3
\n", + "
Output 4


0.30.1-0.10.50.70.90.1284670.128467base value0.08934880.0893488fOutput 0(inputs)0.016 ĠSc 0.013 Ġat 0.013 Ġa 0.011 Ġtime 0.01 ĠStacy 0.004 Ġsister 0.001 O 0.0 -0.032 ) -0.019 23 -0.017 umb -0.016 . -0.01 ag -0.01 Ġ( -0.003 Ġthe -0.001 Ġis -0.0 lder -0.0
inputs
-0.0
0.001
O
-0.0
lder
0.004
Ġsister
-0.01
Ġ(
-0.019
23
0.013
Ġat
-0.003
Ġthe
0.011
Ġtime
-0.032
)
-0.001
Ġis
0.013
Ġa
0.016
ĠSc
-0.017
umb
-0.01
ag
0.01
ĠStacy
-0.016
.
0.0
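A plot like the one summarized above can be produced by wiring the `predict` wrapper into SHAP's text explainer. What follows is a minimal illustrative sketch, not necessarily the exact cell used in this notebook: it assumes the standard `shap.maskers.Text` masker and `shap.plots.text` renderer, and the `output_names` list mapping outputs 0-4 to the five emotion columns is an added assumption for readability.

import shap

# Let SHAP perturb inputs by masking tokens produced by the RoBERTa tokenizer
masker = shap.maskers.Text(tokenizer)

# predict() (defined above) maps a list of strings to an array of five sigmoid
# probabilities, so SHAP explains each emotion output separately
explainer = shap.Explainer(
    predict,
    masker,
    output_names=["anger", "fear", "joy", "sadness", "surprise"],  # assumed label order
)

# Explaining transformer predictions is slow, so only a few dev examples are used
shap_values = explainer(dev_texts[:5])

# Render the interactive per-token attribution plot (the HTML output described above)
shap.plots.text(shap_values)

Because `predict` applies an independent sigmoid per label, the attributions are computed per emotion rather than over softmax-competing classes, which matches the multi-label setup used for training and evaluation above.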