Updated dataloader to fit with MNIST 4-9

Johanmkr · Johanmkr · commit 22df0a0b2623 · 2025-02-11T16:29:11.000+01:00
diff --git a/utils/dataloaders/mnist_4_9.py b/utils/dataloaders/mnist_4_9.py
@@ -1,64 +1,54 @@
-import gzip
-import os
-import urllib.request as ur
 from pathlib import Path
 
+import numpy as np
 from torch.utils.data import Dataset
 
+from .datasources import MNIST_SOURCE
 
 class MNIST_4_9(Dataset):
-    def __init__(self, datapath: Path, train: bool = False, download: bool = False):
+    """
+    MNIST dataset of numbers 4-9.
+
+    Parameters
+    ----------
+    data_path : Path
+        Root directory where MNIST dataset is stored
+    sample_ids : np.ndarray
+        Array of indices specifying which samples to load. This determines the samples used by the dataloader.
+    train : bool, optional
+        Whether to read the training files (True) or the test files (False), by default False
+    """
+    def __init__(self, data_path: Path, sample_ids: np.ndarray, train: bool = False):
-        super.__init__()
-        self.datapath = datapath
-        self.mnist_path = self.datapath / "MNIST"
+        super().__init__()  # call super() to get the proxy; bare `super.__init__()` raises TypeError
+        self.data_path = data_path
+        self.mnist_path = self.data_path / "MNIST"
+        self.samples = sample_ids
         self.train = train
-        self.download = download
-        self.num_classes: int = 6
-
-        if not self.download and not self._already_downloaded():
-            raise FileNotFoundError(
-                "Data files are not found. Set --download-data=True to download the data"
-            )
-        if self.download and not self._already_downloaded():
-            self._download()
-
-    def _download(self):
-        urls: dict = {
-            "train_images": "https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz",
-            "train_labels": "https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz",
-            "test_images": "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz",
-            "test_labels": "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz",
-        }
-
-        for url in urls.values():
-            file_path: Path = os.path.join(self.mnist_path, url.split("/")[-1])
-            file_name: Path = file_path.replace(".gz", "")
-            if os.path.exists(file_name):
-                print(f"File: {file_name} already downloaded")
-            else:
-                print(f"File: {file_name} is downloading...")
-                ur.urlretrieve(url, file_path)  # Download file
-                with gzip.open(file_path, "rb") as infile:
-                    with open(file_name, "wb") as outfile:
-                        outfile.write(infile.read())  # Write from url to local file
-                    os.remove(file_path)  # remove .gz file
-
-    def _already_downloaded(self):
-        if self.mnist_path.exists():
-            required_files: list = [
-                "train-images-idx3-ubyte",
-                "train-labels-idx1-ubyte",
-                "t10k-images-idx3-ubyte",
-                "t10k-labels-idx1-ubyte",
-            ]
-            return all([(self.mnist_path / file).exists() for file in required_files])
-
-        else:
-            self.mnist_path.mkdir(parents=True, exist_ok=True)
-            return False
-
+        
+        self.images_path = self.mnist_path / (
+            MNIST_SOURCE["train_images"][1] if train else MNIST_SOURCE["test_images"][1]
+        )
+        self.labels_path = self.mnist_path / (
+            MNIST_SOURCE["train_labels"][1] if train else MNIST_SOURCE["test_labels"][1]
+        )
+        
+        
     def __len__(self):
-        pass
-
-    def __getitem__(self):
-        pass
+        return len(self.samples)  # dataset length is the number of sample ids given to __init__
+    
+    def __getitem__(self, idx):
+        with open(self.labels_path, "rb") as labelfile:
+            label_pos = 8 + self.samples[idx]  # skip 8-byte IDX header; one byte per label
+            labelfile.seek(label_pos)
+            label = int.from_bytes(labelfile.read(1), byteorder="big")
+
+        with open(self.images_path, "rb") as imagefile:
+            image_pos = 16 + self.samples[idx] * 28 * 28  # skip 16-byte IDX header; 28*28 bytes per image
+            imagefile.seek(image_pos)
+            image = np.frombuffer(imagefile.read(28 * 28), dtype=np.uint8).reshape(
+                28, 28
+            )
+
+        image = np.expand_dims(image, axis=0)  # Channel
+
+        return image, label