Remove downloading logic from USPS dataset

c-salomonsen · c-salomonsen · commit a9e2cade27ee · 2025-02-08T19:53:01.000+01:00
diff --git a/utils/dataloaders/usps_0_6.py b/utils/dataloaders/usps_0_6.py
@@ -4,20 +4,14 @@
 This module contains the Dataset class for the USPS dataset with labels 0-6.
 """
 
-import bz2
-import hashlib
 from pathlib import Path
-from tempfile import TemporaryDirectory
-from urllib.request import urlretrieve
 
 import h5py as h5
 import numpy as np
 from PIL import Image
 from torch.utils.data import Dataset
 from torchvision import transforms
 
-from .datasources import USPS_SOURCE
-
 
 class USPSDataset0_6(Dataset):
     """
@@ -87,178 +81,31 @@ class USPSDataset0_6(Dataset):
     def __init__(
         self,
         data_path: Path,
+        sample_ids: list,
         train: bool = False,
         transform=None,
-        download: bool = False,
     ):
         super().__init__()
 
         path = data_path if isinstance(data_path, Path) else Path(data_path)
         self.filepath = path / self.filename
         self.transform = transform
         self.mode = "train" if train else "test"
+        self.sample_ids = sample_ids
 
-        # Download the dataset if it does not exist in a temporary directory
-        # to automatically clean up the downloaded file
-        if download and not self._dataset_ok():
-            url, _, checksum = USPS_SOURCE[self.mode]
-
-            print(f"Downloading USPS dataset ({self.mode})...")
-            self.download(url, self.filepath, checksum, self.mode)
-
-        self.idx = self._index()
-
-    def _dataset_ok(self):
-        """Check if the dataset file exists and contains the required datasets."""
-
-        if not self.filepath.exists():
-            print(f"Dataset file {self.filepath} does not exist.")
-            return False
-
-        with h5.File(self.filepath, "r") as f:
-            for mode in ["train", "test"]:
-                if mode not in f:
-                    print(
-                        f"Dataset file {self.filepath} is missing the {mode} dataset."
-                    )
-                    return False
-
-        return True
-
-    def download(self, url, filepath, checksum, mode):
-        """Download the USPS dataset, and save it as an HDF5 file.
-
-        Args
-        ----
-        url : str
-            URL to download the dataset from.
-        filepath : pathlib.Path
-            Path to save the downloaded dataset.
-        checksum : str
-            MD5 checksum of the downloaded file.
-        mode : str
-            Mode of the dataset, either train or test.
-
-        Raises
-        ------
-        ValueError
-            If the checksum of the downloaded file does not match the expected checksum.
-        """
-
-        def reporthook(blocknum, blocksize, totalsize):
-            """Report download progress."""
-            denom = 1024 * 1024
-            readsofar = blocknum * blocksize
-            if totalsize > 0:
-                percent = readsofar * 1e2 / totalsize
-                s = f"\r{int(percent):^3}% {readsofar / denom:.2f} of {totalsize / denom:.2f} MB"
-                print(s, end="", flush=True)
-                if readsofar >= totalsize:
-                    print()
-
-        # Download the dataset to a temporary file
-        with TemporaryDirectory() as tmpdir:
-            tmpdir = Path(tmpdir)
-            tmpfile = tmpdir / "usps.bz2"
-            urlretrieve(
-                url,
-                tmpfile,
-                reporthook=reporthook,
-            )
-
-            # For fun we can check the integrity of the downloaded file
-            if not self.check_integrity(tmpfile, checksum):
-                errmsg = (
-                    "The checksum of the downloaded file does "
-                    "not match the expected checksum."
-                )
-                raise ValueError(errmsg)
-
-            # Load the dataset and save it as an HDF5 file
-            with bz2.open(tmpfile) as fp:
-                raw = [line.decode().split() for line in fp.readlines()]
-
-                tmp = [[x.split(":")[-1] for x in data[1:]] for data in raw]
-
-                imgs = np.asarray(tmp, dtype=np.float32)
-                imgs = ((imgs + 1) / 2 * 255).astype(dtype=np.uint8)
-
-                targets = [int(d[0]) - 1 for d in raw]
-
-        with h5.File(self.filepath, "a") as f:
-            f.create_dataset(f"{mode}/data", data=imgs, dtype=np.float32)
-            f.create_dataset(f"{mode}/target", data=targets, dtype=np.int32)
-
-    @staticmethod
-    def check_integrity(filepath, checksum):
-        """Check the integrity of the USPS dataset file.
-
-        Args
-        ----
-        filepath : pathlib.Path
-            Path to the USPS dataset file.
-        checksum : str
-            MD5 checksum of the dataset file.
-
-        Returns
-        -------
-        bool
-            True if the checksum of the file matches the expected checksum, False otherwise
-        """
-
-        file_hash = hashlib.md5(filepath.read_bytes()).hexdigest()
-
-        return checksum == file_hash
-
-    def _index(self):
-        with h5.File(self.filepath, "r") as f:
-            labels = f[self.mode]["target"][:]
-
-        # Get indices of samples with labels 0-6
-        mask = labels <= 6
-        idx = np.where(mask)[0]
+    def __len__(self):
+        return len(self.sample_ids)
 
-        return idx
+    def __getitem__(self, id):
+        index = self.sample_ids[id]
 
-    def _load_data(self, idx):
         with h5.File(self.filepath, "r") as f:
-            data = f[self.mode]["data"][idx].astype(np.uint8)
-            label = f[self.mode]["target"][idx]
+            data = f[self.mode]["data"][index].astype(np.uint8)
+            label = f[self.mode]["target"][index]
 
-        return data, label
-
-    def __len__(self):
-        return len(self.idx)
-
-    def __getitem__(self, idx):
-        data, target = self._load_data(self.idx[idx])
         data = Image.fromarray(data, mode="L")
 
-        # one hot encode the target
-        target = np.eye(self.num_classes, dtype=np.float32)[target]
-
         if self.transform:
             data = self.transform(data)
 
-        return data, target
-
-
-if __name__ == "__main__":
-    # Example usage:
-    transform = transforms.Compose(
-        [
-            transforms.Resize((16, 16)),
-            transforms.ToTensor(),
-        ]
-    )
-
-    dataset = USPSDataset0_6(
-        data_path="data",
-        train=True,
-        download=False,
-        transform=transform,
-    )
-    print(len(dataset))
-    data, target = dataset[0]
-    print(data.shape)
-    print(target)
+        return data, label