Commit a90f041

addressing PR comments
1 parent f7b2825 commit a90f041

File tree

2 files changed: +67 -82 lines changed


notebooks/influence_sentiment_analysis.ipynb

Lines changed: 25 additions & 82 deletions
@@ -83,16 +83,19 @@
 }
 ],
 "source": [
-"from datasets import load_dataset\n",
-"import torch\n",
-"from sklearn.metrics import f1_score\n",
+"from copy import deepcopy\n",
 "from typing import Sequence\n",
-"from pydvl.influence.torch import EkfacInfluence\n",
+"\n",
+"import matplotlib.pyplot as plt\n",
+"import torch\n",
 "import torch.nn.functional as F\n",
-"from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
-"from copy import deepcopy\n",
+"from datasets import load_dataset\n",
 "from IPython.display import HTML, display\n",
-"import matplotlib.pyplot as plt"
+"from sklearn.metrics import f1_score\n",
+"from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
+"\n",
+"from pydvl.influence.torch import EkfacInfluence\n",
+"from support.torch import ImdbDataset, ModelLogitsWrapper"
 ]
 },
 {
@@ -156,8 +159,9 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
+"Using the latest cached version of the module from /Users/fabio/.cache/huggingface/modules/datasets_modules/datasets/imdb/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0 (last modified on Thu Dec 14 21:47:25 2023) since it couldn't be found locally at imdb., or remotely on the Hugging Face Hub.\n",
 "Found cached dataset imdb (/Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)\n",
-"100%|██████████| 3/3 [00:00<00:00, 136.16it/s]\n"
+"100%|██████████| 3/3 [00:00<00:00, 111.43it/s]\n"
 ]
 }
 ],
@@ -265,18 +269,11 @@
 "tokenized_example = tokenizer(\n",
 "    [example_phrase],\n",
 "    return_tensors=\"pt\",\n",
-"    padding=True,\n",
 "    truncation=True,\n",
 ")\n",
 "\n",
-"tokenized_example_input_ids, tokenized_example_attention_mask = (\n",
-"    tokenized_example.input_ids,\n",
-"    tokenized_example.attention_mask,\n",
-")\n",
-"\n",
 "model_output = model(\n",
-"    input_ids=tokenized_example_input_ids,\n",
-"    attention_mask=tokenized_example_attention_mask,\n",
+"    input_ids=tokenized_example.input_ids,\n",
 ")"
 ]
 },
@@ -322,13 +319,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"model_predictions = F.softmax(\n",
-"    model(\n",
-"        input_ids=tokenized_example_input_ids,\n",
-"        attention_mask=tokenized_example_attention_mask,\n",
-"    )[\"logits\"],\n",
-"    dim=1,\n",
-")"
+"model_predictions = F.softmax(model_output.logits, dim=1)"
 ]
 },
 {
@@ -386,7 +377,7 @@
 "output_type": "stream",
 "text": [
 "Loading cached shuffled indices for dataset at /Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-c1eaa46e94dfbfd3.arrow\n",
-"Loading cached processed dataset at /Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-c5cc0d728c27151c.arrow\n"
+"Loading cached processed dataset at /Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5dd4cdcbaa0bcc93.arrow\n"
 ]
 }
 ],
@@ -402,7 +393,7 @@
 "    logits = model(\n",
 "        input_ids=sample_test_set[\"input_ids\"],\n",
 "        attention_mask=sample_test_set[\"attention_mask\"],\n",
-"    )[0]\n",
+"    ).logits\n",
 "    predictions = torch.argmax(logits, dim=1)"
 ]
 },
@@ -435,7 +426,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"In this section we will define several helper function and classes that will be used in the rest of the notebook. "
+"In this section we will define two helper functions and classes that will be used in the rest of the notebook. "
 ]
 },
 {
@@ -444,47 +435,6 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"class ImdbDataset(torch.utils.data.Dataset):\n",
-"    \"\"\"\n",
-"    A PyTorch Dataset that takes in an HuggingFace Dataset object and tokenizes it.\n",
-"    The objects returned by __getitem__ are PyTorch tensors, with x being a tuple of\n",
-"    (input_ids, attention_mask), ready to be fed into a model, and y being the label.\n",
-"    It also returns the original text, for printing and debugging purposes.\n",
-"    \"\"\"\n",
-"\n",
-"    def __init__(self, dataset):\n",
-"        self.tokenized_ds = dataset.map(self.preprocess_function, batched=True)\n",
-"        self.encodings = self.tokenized_ds[\"input_ids\"]\n",
-"        self.attn_mask = self.tokenized_ds[\"attention_mask\"]\n",
-"        self.labels = self.tokenized_ds[\"label\"]\n",
-"\n",
-"    def preprocess_function(self, examples):\n",
-"        return tokenizer(examples[\"text\"], truncation=True, padding=True)\n",
-"\n",
-"    def __getitem__(self, idx):\n",
-"        x = torch.tensor([self.encodings[idx], self.attn_mask[idx]])\n",
-"        y = torch.tensor(self.labels[idx])\n",
-"        text = self.tokenized_ds[idx][\"text\"]\n",
-"        return x, y, text\n",
-"\n",
-"    def __len__(self):\n",
-"        return len(self.labels)\n",
-"\n",
-"\n",
-"class ModelLogitsWrapper(torch.nn.Module):\n",
-"    \"\"\"\n",
-"    A wrapper around a PyTorch model that returns only the logits and not the loss or\n",
-"    the attention mask.\n",
-"    \"\"\"\n",
-"\n",
-"    def __init__(self, model):\n",
-"        super().__init__()\n",
-"        self.model = model\n",
-"\n",
-"    def forward(self, x):\n",
-"        return self.model(x[:, 0], x[:, 1])[\"logits\"]\n",
-"\n",
-"\n",
 "def print_sentiment_preds(\n",
 "    model: ModelLogitsWrapper, model_input: torch.Tensor, true_label: int\n",
 "):\n",
@@ -620,8 +570,8 @@
 "text": [
 "Loading cached shuffled indices for dataset at /Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-9c48ce5d173413c7.arrow\n",
 "Loading cached shuffled indices for dataset at /Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-c1eaa46e94dfbfd3.arrow\n",
-"Loading cached processed dataset at /Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-9aaaa3770ef3f9bf.arrow\n",
-"Loading cached processed dataset at /Users/fabio/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-7a8cbae367cafa72.arrow\n"
+" 0%| | 0/1 [00:00<?, ?ba/s]\n",
+" 0%| | 0/1 [00:00<?, ?ba/s]\n"
 ]
 }
 ],
@@ -638,8 +588,8 @@
 "    imdb[\"test\"].shuffle(seed=seed).select([i for i in list(range(NUM_TEST_EXAMPLES))])\n",
 ")\n",
 "\n",
-"train_dataset = ImdbDataset(small_train_dataset)\n",
-"test_dataset = ImdbDataset(small_test_dataset)\n",
+"train_dataset = ImdbDataset(small_train_dataset, tokenizer=tokenizer)\n",
+"test_dataset = ImdbDataset(small_test_dataset, tokenizer=tokenizer)\n",
 "\n",
 "train_dataloader = torch.utils.data.DataLoader(\n",
 "    train_dataset, batch_size=7, shuffle=True\n"
@@ -663,14 +613,7 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"K-FAC blocks - batch progress: 0%| | 0/15 [00:00<?, ?it/s]"
-]
-},
-{
-"name": "stderr",
-"output_type": "stream",
-"text": [
-"K-FAC blocks - batch progress: 100%|██████████| 15/15 [01:59<00:00, 7.98s/it]\n"
+"K-FAC blocks - batch progress: 100%|██████████| 15/15 [01:52<00:00, 7.53s/it]\n"
 ]
 }
 ],
@@ -707,7 +650,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"We calculate the influence of the first batch of training data over the first batch of test data. This because influence functions are very expensive to compute, and so to keep the runtime of this notebook within a few minutes we need to restrict ourselves a small number of examples."
+"We calculate the influence of the first batch of training data over the first batch of test data. This is because influence functions are very expensive to compute, and so to keep the runtime of this notebook within a few minutes we need to restrict ourselves to a small number of examples."
 ]
 },
 {
@@ -925,14 +868,14 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"This review is also quite hard to classify. This time it has a negative sentiment towards the movie, but it also contains several words with positive connotation. The parallel with the previous review is quite interesting, since both talk about an invasion. "
+"This review is also quite hard to classify. This time it has a negative sentiment towards the movie, but it also contains several words with positive connotation. The parallel with the previous review is quite interesting since both talk about an invasion. "
 ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"As it is often the case when analysing influence functions, it is hard to understand why these examples have such a large influence. We have seen some interesting patterns, mostly related to similarities in the language and words used, but it is hard to say with certainty if these are the reasons for the large influence.\n",
+"As is often the case when analysing influence functions, it is hard to understand why these examples have such a large influence. We have seen some interesting patterns, mostly related to similarities in the language and words used, but it is hard to say with certainty if these are the reasons for such a large influence.\n",
 "\n",
 "A [recent paper](https://arxiv.org/abs/2308.03296) has explored this topic in high detail, even for much larger language models than BERT (up to ~50 billion parameters!). Among the most interesting findings is that smaller models tend to rely a lot on word-to-word correspondencies, while larger models are more capable of extracting higher level concepts, drawing connections between words across multiple phrases.\n",
 "\n",

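For quick orientation, here is a minimal sketch of the inference pattern the simplified notebook cells follow after this change: tokenize the phrase, call the model with input_ids only, and read predictions from the named logits attribute instead of positional indexing. The checkpoint string below is an assumption for illustration only; the notebook loads its own fine-tuned model and tokenizer.

import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Illustrative checkpoint; the notebook uses its own fine-tuned sentiment model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

example_phrase = "This movie was a complete waste of time."

# A single example needs no padding; truncation guards against overlong input.
tokenized_example = tokenizer([example_phrase], return_tensors="pt", truncation=True)

# With one unpadded example the attention mask is all ones, so passing
# input_ids alone is sufficient (this is what the simplified cell does).
model_output = model(input_ids=tokenized_example.input_ids)

# Read logits via the named attribute rather than positional indexing.
model_predictions = F.softmax(model_output.logits, dim=1)
print(model_predictions)  # class probabilities, shape (1, num_labels)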
notebooks/support/torch.py

Lines changed: 42 additions & 0 deletions
@@ -255,6 +255,48 @@ def load(self) -> Losses:
             return pkl.load(file)
 
 
+class ImdbDataset(torch.utils.data.Dataset):
+    """
+    A PyTorch Dataset that takes in a HuggingFace Dataset object and tokenizes it.
+    The objects returned by __getitem__ are PyTorch tensors, with x being a tuple of
+    (input_ids, attention_mask), ready to be fed into a model, and y being the label.
+    It also returns the original text, for printing and debugging purposes.
+    """
+
+    def __init__(self, dataset, tokenizer):
+        self.tokenizer = tokenizer
+        self.tokenized_ds = dataset.map(self.preprocess_function, batched=True)
+        self.encodings = self.tokenized_ds["input_ids"]
+        self.attn_mask = self.tokenized_ds["attention_mask"]
+        self.labels = self.tokenized_ds["label"]
+
+    def preprocess_function(self, examples):
+        return self.tokenizer(examples["text"], truncation=True, padding=True)
+
+    def __getitem__(self, idx):
+        x = torch.tensor([self.encodings[idx], self.attn_mask[idx]])
+        y = torch.tensor(self.labels[idx])
+        text = self.tokenized_ds[idx]["text"]
+        return x, y, text
+
+    def __len__(self):
+        return len(self.labels)
+
+
+class ModelLogitsWrapper(torch.nn.Module):
+    """
+    A wrapper around a PyTorch model that returns only the logits and not the loss or
+    the attention mask.
+    """
+
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, x):
+        return self.model(x[:, 0], x[:, 1]).logits
+
+
 def process_imgnet_io(
     df: pd.DataFrame, labels: dict
 ) -> Tuple[torch.Tensor, torch.Tensor]:
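A short usage sketch of the two helpers added above, mirroring how the updated notebook constructs them (ImdbDataset with an explicit tokenizer feeding a DataLoader, and ModelLogitsWrapper around the classifier). The checkpoint name, subset size, and seed are placeholders for objects the notebook already has in scope.

import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from support.torch import ImdbDataset, ModelLogitsWrapper

# Placeholder checkpoint; in the notebook the fine-tuned model and tokenizer already exist.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

imdb = load_dataset("imdb")
small_train_dataset = imdb["train"].shuffle(seed=42).select(range(100))

# ImdbDataset tokenizes the split up front via dataset.map and stacks
# (input_ids, attention_mask) into one (2, seq_len) tensor per example.
train_dataset = ImdbDataset(small_train_dataset, tokenizer=tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=7, shuffle=True)

x_batch, y_batch, texts = next(iter(train_dataloader))

# ModelLogitsWrapper splits the stacked tensor back into (input_ids, attention_mask)
# and returns only the logits, the single-tensor interface expected downstream.
wrapped_model = ModelLogitsWrapper(model)
with torch.no_grad():
    logits = wrapped_model(x_batch)  # shape: (batch_size, num_labels)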
