added MNIST downloader, adjusted minor things for the code to run

hzavadil98 · hzavadil98 · commit 15c99ea46fb2 · 2025-02-10T13:44:05.000+01:00
diff --git a/main.py b/main.py
@@ -41,9 +41,8 @@ def main():
 
     traindata, validata, testdata = load_data(
         args.dataset,
-        data_path=args.datafolder,
+        data_dir=args.datafolder,
         transform=transform,
-        download=args.download_data,
         val_size=args.val_size,
     )
 
diff --git a/utils/dataloaders/datasources.py b/utils/dataloaders/datasources.py
@@ -17,3 +17,22 @@
         "8ea070ee2aca1ac39742fdd1ef5ed118",
     ],
 }
+
+MNIST_SOURCE = {  # name -> [gzip URL, extracted file name, third slot unused here (presumably a checksum; TODO confirm)]
+    "train_images": ["https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz", 
+                     "train-images-idx3-ubyte", 
+                     None
+    ],
+    "train_labels": ["https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz",
+                    "train-labels-idx1-ubyte",
+                    None
+    ],
+    "test_images": ["https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz",
+                    "t10k-images-idx3-ubyte",
+                    None
+    ],
+    "test_labels": ["https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz",
+                    "t10k-labels-idx1-ubyte",
+                    None
+    ],
+}
diff --git a/utils/dataloaders/download.py b/utils/dataloaders/download.py
@@ -1,13 +1,15 @@
 import bz2
 import hashlib
+import os
+import gzip
 from pathlib import Path
 from tempfile import TemporaryDirectory
 from urllib.request import urlretrieve
 
 import h5py as h5
 import numpy as np
 
-from .datasources import USPS_SOURCE
+from .datasources import USPS_SOURCE, MNIST_SOURCE
 
 
 class Downloader:
@@ -38,7 +40,47 @@ class Downloader:
     """
 
     def mnist(self, data_dir: Path) -> tuple[np.ndarray, np.ndarray]:
-        raise NotImplementedError("MNIST download not implemented yet")
+        """Download MNIST into ``data_dir / "MNIST"`` if missing; return labels.
+
+        Returns the raw uint8 ``(train_labels, test_labels)`` arrays read from
+        the idx1 label files (the 8-byte header is skipped).
+        """
+        # Every file is stored in (and later read from) the MNIST subfolder.
+        mnist_dir = Path(data_dir) / "MNIST"
+
+        def _check_is_downloaded(path: Path) -> bool:
+            # True only when all four extracted files are already present.
+            path.mkdir(parents=True, exist_ok=True)
+            present = all((path / src[1]).exists() for src in MNIST_SOURCE.values())
+            if present:
+                print("MNIST Dataset already downloaded.")
+            return present
+
+        def _download_data(path: Path) -> None:
+            for url, filename, _ in MNIST_SOURCE.values():
+                target = path / filename
+                if target.exists():  # Avoid re-downloading
+                    continue
+                archive = path / url.split("/")[-1]
+                urlretrieve(url, archive)
+                # Stream the decompression instead of loading it all in memory.
+                with gzip.open(archive, "rb") as f_in, open(target, "wb") as f_out:
+                    while chunk := f_in.read(1 << 20):
+                        f_out.write(chunk)
+                archive.unlink()  # Remove compressed file
+
+        def _get_labels(path: Path) -> np.ndarray:
+            # idx1 layout: 4-byte magic + 4-byte item count, then one byte/label.
+            return np.frombuffer(path.read_bytes(), dtype=np.uint8, offset=8)
+
+        if not _check_is_downloaded(mnist_dir):
+            # Download into the MNIST subfolder, where the labels are read from
+            # (previously the files landed in data_dir itself and were not found).
+            _download_data(mnist_dir)
+
+        train_labels = _get_labels(mnist_dir / MNIST_SOURCE["train_labels"][1])
+        test_labels = _get_labels(mnist_dir / MNIST_SOURCE["test_labels"][1])
+        return train_labels, test_labels
 
     def svhn(self, data_dir: Path) -> tuple[np.ndarray, np.ndarray]:
         raise NotImplementedError("SVHN download not implemented yet")
diff --git a/utils/dataloaders/mnist_0_3.py b/utils/dataloaders/mnist_0_3.py
@@ -1,154 +1,72 @@
-import gzip
-import os
-import urllib.request
 from pathlib import Path
 
 import numpy as np
-import torch
-from torch.utils.data import Dataset, random_split
+from torch.utils.data import Dataset
+from .datasources import MNIST_SOURCE
 
 
 class MNISTDataset0_3(Dataset):
     """
-    A custom dataset class for loading MNIST data, specifically for digits 0 through 3.
-
+    A custom Dataset class for loading a subset of the MNIST dataset containing digits 0 to 3.
     Parameters
     ----------
     data_path : Path
-        The root directory where the MNIST data is stored or will be downloaded.
+        The root directory where the MNIST data is stored.
+    sample_ids : list
+        A list of indices specifying which samples to load.
     train : bool, optional
-        If True, loads the training data, otherwise loads the test data. Default is False.
+        If True, load training data, otherwise load test data. Default is False.
     transform : callable, optional
-        A function/transform that takes in an image and returns a transformed version. Default is None.
-    download : bool, optional
-        If True, downloads the dataset if it is not already present in the specified data_path. Default is False.
-
+        A function/transform to apply to the images. Default is None.
     Attributes
     ----------
     data_path : Path
         The root directory where the MNIST data is stored.
     mnist_path : Path
-        The directory where the MNIST data files are stored.
+        The directory where the MNIST dataset is located within the root directory.
+    idx : list
+        A list of indices specifying which samples to load.
     train : bool
-        Indicates whether the training data or test data is being used.
+        Indicates whether to load training data or test data.
     transform : callable
-        A function/transform that takes in an image and returns a transformed version.
-    download : bool
-        Indicates whether the dataset should be downloaded if not present.
+        A function/transform to apply to the images.
+    num_classes : int
+        The number of classes in the dataset (0 to 3).
     images_path : Path
-        The path to the image file (training or test) based on the `train` flag.
+        The path to the image file (train or test) based on the `train` flag.
     labels_path : Path
-        The path to the label file (training or test) based on the `train` flag.
-    idx : numpy.ndarray
-        Indices of the labels that are less than 4.
+        The path to the label file (train or test) based on the `train` flag.
     length : int
         The number of samples in the dataset.
-
     Methods
     -------
-    _parse_labels(train)
-        Parses the labels from the label file.
-    _chech_is_downloaded()
-        Checks if the dataset is already downloaded.
-    _download_data()
-        Downloads and extracts the MNIST dataset.
     __len__()
         Returns the number of samples in the dataset.
     __getitem__(index)
-        Returns the image and label at the specified index.
+        Retrieves the image and label at the specified index.
     """
 
     def __init__(
         self,
-        split: str,
-        split_percentage: float,
         data_path: Path,
-        download: bool = False,
+        sample_ids: list,
+        train: bool = False,
         transform=None,
     ):
         super().__init__()
 
         self.data_path = data_path
         self.mnist_path = self.data_path / "MNIST"
-        self.split = split
-        self.split_percentage = split_percentage
+        self.idx = sample_ids
+        self.train = train
         self.transform = transform
-        self.download = download
         self.num_classes = 4
 
-        if self.split == "test":
-            train = False  # used to decide whether to load training or test dataset
-        else:
-            train = True
-
-        if not self.download and not self._chech_is_downloaded():
-            raise ValueError(
-                "Data not found. Set --download-data=True to download the data."
-            )
-        if self.download and not self._chech_is_downloaded():
-            self._download_data()
-
-        self.images_path = self.mnist_path / (
-            "train-images-idx3-ubyte" if train else "t10k-images-idx3-ubyte"
-        )
-        self.labels_path = self.mnist_path / (
-            "train-labels-idx1-ubyte" if train else "t10k-labels-idx1-ubyte"
-        )
-
-        labels = self._parse_labels()
-
-        self.idx = np.where(labels < 4)[0]
-
-        if self.split != "test":
-            generator1 = torch.Generator().manual_seed(42)
-            tr, val = random_split(
-                self.idx,
-                [1 - self.split_percentage, self.split_percentage],
-                generator=generator1,
-            )
-            self.idx = tr if self.split == "train" else val
+        self.images_path = self.mnist_path / (MNIST_SOURCE["train_images"][1] if train else MNIST_SOURCE["test_images"][1])
+        self.labels_path = self.mnist_path / (MNIST_SOURCE["train_labels"][1] if train else MNIST_SOURCE["test_labels"][1])
 
         self.length = len(self.idx)
-
-    def _parse_labels(self):
-        with open(self.labels_path, "rb") as f:
-            data = np.frombuffer(f.read(), dtype=np.uint8, offset=8)
-        return data
-
-    def _chech_is_downloaded(self):
-        if self.mnist_path.exists():
-            required_files = [
-                "train-images-idx3-ubyte",
-                "train-labels-idx1-ubyte",
-                "t10k-images-idx3-ubyte",
-                "t10k-labels-idx1-ubyte",
-            ]
-            if all([(self.mnist_path / file).exists() for file in required_files]):
-                print("MNIST Dataset already downloaded.")
-                return True
-            else:
-                return False
-        else:
-            self.mnist_path.mkdir(parents=True, exist_ok=True)
-            return False
-
-    def _download_data(self):
-        urls = {
-            "train_images": "https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz",
-            "train_labels": "https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz",
-            "test_images": "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz",
-            "test_labels": "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz",
-        }
-
-        for name, url in urls.items():
-            file_path = os.path.join(self.mnist_path, url.split("/")[-1])
-            if not os.path.exists(file_path.replace(".gz", "")):  # Avoid re-downloading
-                urllib.request.urlretrieve(url, file_path)
-                with gzip.open(file_path, "rb") as f_in:
-                    with open(file_path.replace(".gz", ""), "wb") as f_out:
-                        f_out.write(f_in.read())
-                os.remove(file_path)  # Remove compressed file
-
+        
     def __len__(self):
         return self.length
 
diff --git a/utils/load_data.py b/utils/load_data.py
@@ -43,19 +43,21 @@ def load_data(dataset: str, *args, **kwargs) -> tuple:
     >>> len(train), len(val), len(test)
     (4914, 546, 1782)
     """
-
+    downloader = Downloader()
+    data_dir = kwargs.get("data_dir")
+    transform = kwargs.get("transform")
     match dataset.lower():
         case "usps_0-6":
             dataset = USPSDataset0_6
-            train_labels, test_labels = Downloader.usps(*args, **kwargs)
+            train_labels, test_labels = downloader.usps(data_dir=data_dir)
             labels = np.arange(7)
         case "usps_7-9":
             dataset = USPSH5_Digit_7_9_Dataset
-            train_labels, test_labels = Downloader.usps(*args, **kwargs)
+            train_labels, test_labels = downloader.usps(data_dir=data_dir)
             labels = np.arange(7, 10)
         case "mnist_0-3":
             dataset = MNISTDataset0_3
-            train_labels, test_labels = Downloader.mnist(*args, **kwargs)
+            train_labels, test_labels = downloader.mnist(data_dir=data_dir)
             labels = np.arange(4)
         case _:
             raise NotImplementedError(f"Dataset: {dataset} not implemented.")
@@ -73,24 +75,24 @@ def load_data(dataset: str, *args, **kwargs) -> tuple:
     train_samples, val_samples = random_split(train_samples, [1 - val_size, val_size])
 
     train = dataset(
-        *args,
+        data_path=data_dir,
         sample_ids=train_samples,
         train=True,
-        **kwargs,
+        transform=transform,
     )
 
     val = dataset(
-        *args,
+        data_path=data_dir,
         sample_ids=val_samples,
         train=True,
-        **kwargs,
+        transform=transform,
     )
 
     test = dataset(
-        *args,
+        data_path=data_dir,
         sample_ids=test_samples,
         train=False,
-        **kwargs,
+        transform=transform,
     )
 
     return train, val, test

Original file line number	Diff line number	Diff line change
`@@ -41,9 +41,8 @@ def main():`
`41`	`41`
`42`	`42`	`traindata, validata, testdata = load_data(`
`43`	`43`	`args.dataset,`
`44`		`- data_path=args.datafolder,`
	`44`	`+ data_dir=args.datafolder,`
`45`	`45`	`transform=transform,`
`46`		`- download=args.download_data,`
`47`	`46`	`val_size=args.val_size,`
`48`	`47`	`)`
`49`	`48`