
Commit 3b1a442

Add download functionality
1 parent: 7ff097a

3 files changed: +105 -10 lines changed


tests/test_dataloaders.py

Lines changed: 10 additions & 3 deletions
@@ -3,17 +3,24 @@
 
 def test_uspsdataset0_6():
     from pathlib import Path
-    from tempfile import TemporaryFile
+    from tempfile import TemporaryDirectory
 
     import h5py
     import numpy as np
 
-    with TemporaryFile() as tf:
+    # Create a temporary directory (deleted after the test)
+    with TemporaryDirectory() as tempdir:
+        tempdir = Path(tempdir)
+
+        tf = tempdir / "usps.h5"
+
+        # Create a h5 file
         with h5py.File(tf, "w") as f:
+            # Populate the file with data
             f["train/data"] = np.random.rand(10, 16 * 16)
             f["train/target"] = np.array([6, 5, 4, 3, 2, 1, 0, 0, 0, 0])
 
-        dataset = USPSDataset0_6(data_path=tf, train=True)
+        dataset = USPSDataset0_6(data_path=tempdir, train=True)
         assert len(dataset) == 10
         data, target = dataset[0]
         assert data.shape == (1, 16, 16)
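
Note: the file written by the test stores flattened 16x16 images under train/data, while the final assertion expects samples with a leading channel dimension. A minimal sketch of that relationship (the reshape is presumed to happen inside the dataset's __getitem__, which this hunk does not show):

    import numpy as np

    # train/data holds flattened 16x16 images ...
    flat = np.random.rand(10, 16 * 16).astype(np.float32)

    # ... and the dataset is expected to return (channel, height, width) samples.
    sample = flat[0].reshape(1, 16, 16)
    print(sample.shape)  # (1, 16, 16)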

utils/dataloaders/datasources.py

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+USPS_SOURCE = {
+    "train": [
+        "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/usps.bz2",
+        "usps.bz2",
+        "ec16c51db3855ca6c91edd34d0e9b197",
+    ],
+    "test": [
+        "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/usps.t.bz2",
+        "usps.t.bz2",
+        "8ea070ee2aca1ac39742fdd1ef5ed118",
+    ],
+}
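
Each entry in USPS_SOURCE is a (URL, archive name, MD5 checksum) triple keyed by split. A minimal sketch of consuming one entry, assuming the repository root is on sys.path and a hypothetical downloads/ directory holding a previously fetched copy:

    import hashlib
    from pathlib import Path

    from utils.dataloaders.datasources import USPS_SOURCE

    url, archive_name, md5 = USPS_SOURCE["train"]

    # Verify a local copy of the archive against the recorded checksum.
    local_copy = Path("downloads") / archive_name  # hypothetical location
    if local_copy.exists():
        digest = hashlib.md5(local_copy.read_bytes()).hexdigest()
        print(f"{archive_name}: checksum {'ok' if digest == md5 else 'mismatch'}")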

utils/dataloaders/usps_0_6.py

Lines changed: 83 additions & 7 deletions
@@ -4,12 +4,18 @@
 This module contains the Dataset class for the USPS dataset with labels 0-6.
 """
 
+import bz2
+import hashlib
 from pathlib import Path
+from tempfile import TemporaryDirectory, TemporaryFile
+from urllib.request import urlretrieve
 
 import h5py as h5
 import numpy as np
 from torch.utils.data import Dataset
 
+from .datasources import USPS_SOURCE
+
 
 class USPSDataset0_6(Dataset):
     """
@@ -28,7 +34,7 @@ class USPSDataset0_6(Dataset):
 
     Attributes
     ----------
-    path : pathlib.Path
+    filepath : pathlib.Path
         Path to the USPS dataset file.
     mode : str
         Mode of the dataset, either train or test.
@@ -63,6 +69,8 @@ class USPSDataset0_6(Dataset):
     6
     """
 
+    filename = "usps.h5"
+
     def __init__(
         self,
         data_path: Path,
@@ -71,18 +79,78 @@ def __init__(
         download: bool = False,
     ):
         super().__init__()
-        self.path = data_path
+
+        path = data_path if isinstance(data_path, Path) else Path(data_path)
+        self.filepath = path / self.filename
         self.transform = transform
-        self.num_classes = 7
+        self.num_classes = 7  # 0-6
+        self.mode = "train" if train else "test"
 
+        # Download the dataset if it does not exist in a temporary directory
+        # to automatically clean up the downloaded file
         if download:
-            raise NotImplementedError("Download functionality not implemented.")
+            url, _, checksum = USPS_SOURCE[self.mode]
+
+            print(f"Downloading USPS dataset ({self.mode})...")
+            self.download(url, self.filepath, checksum, self.mode)
 
-        self.mode = "train" if train else "test"
         self.idx = self._index()
 
+    def download(self, url, filepath, checksum, mode):
+        """Download the USPS dataset."""
+
+        def reporthook(blocknum, blocksize, totalsize):
+            denom = 1024 * 1024
+            readsofar = blocknum * blocksize
+            if totalsize > 0:
+                percent = readsofar * 1e2 / totalsize
+                s = f"\r{int(percent):^3}% {readsofar / denom:.2f} of {totalsize / denom:.2f} MB"
+                print(s, end="", flush=True)
+                if readsofar >= totalsize:
+                    print()
+
+        with TemporaryDirectory() as tmpdir:
+            tmpdir = Path(tmpdir)
+            tmpfile = tmpdir / "usps.bz2"
+            urlretrieve(
+                url,
+                tmpfile,
+                reporthook=reporthook,
+            )
+
+            # For fun we can check the integrity of the downloaded file
+            if not self.check_integrity(tmpfile, checksum):
+                errmsg = (
+                    "The checksum of the downloaded file does "
+                    "not match the expected checksum."
+                )
+                raise ValueError(errmsg)
+
+            # Load the dataset and save it as an HDF5 file
+            with bz2.open(tmpfile) as fp:
+                raw = [line.decode().split() for line in fp.readlines()]
+
+            tmp = [[x.split(":")[-1] for x in data[1:]] for data in raw]
+
+            imgs = np.asarray(tmp, dtype=np.float32)
+            imgs = ((imgs + 1) / 2 * 255).astype(dtype=np.uint8)
+
+            targets = [int(d[0]) - 1 for d in raw]
+
+            with h5.File(self.filepath, "w") as f:
+                f.create_dataset(f"{mode}/data", data=imgs, dtype=np.float32)
+                f.create_dataset(f"{mode}/target", data=targets, dtype=np.int32)
+
+    @staticmethod
+    def check_integrity(filepath, checksum):
+        """Check the integrity of the USPS dataset file."""
+
+        file_hash = hashlib.md5(filepath.read_bytes()).hexdigest()
+
+        return checksum == file_hash
+
     def _index(self):
-        with h5.File(self.path, "r") as f:
+        with h5.File(self.filepath, "r") as f:
             labels = f[self.mode]["target"][:]
 
         # Get indices of samples with labels 0-6
@@ -92,7 +160,7 @@ def _index(self):
         return idx
 
     def _load_data(self, idx):
-        with h5.File(self.path, "r") as f:
+        with h5.File(self.filepath, "r") as f:
             data = f[self.mode]["data"][idx]
             label = f[self.mode]["target"][idx]
 
@@ -116,3 +184,11 @@ def __getitem__(self, idx):
             data = self.transform(data)
 
         return data, target
+
+
+if __name__ == "__main__":
+    dataset = USPSDataset0_6(data_path="data", train=True, download=True)
+    print(len(dataset))
+    data, target = dataset[0]
+    print(data.shape)
+    print(target)
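
For context, the bz2 archives from the LIBSVM site store one sample per line as "label index:value index:value ...", with USPS labels encoded as 1-10 and pixel values roughly in [-1, 1]; that is why download() shifts labels down by one and rescales pixels to 0-255. A minimal sketch of the per-line parsing, using an illustrative truncated line:

    import numpy as np

    # Illustrative (truncated) LIBSVM line: label first, then index:value pairs.
    line = "7 1:-0.8125 2:-0.6094 3:0.2188"

    parts = line.split()
    label = int(parts[0]) - 1  # labels 1-10 -> 0-9
    values = [float(x.split(":")[-1]) for x in parts[1:]]

    # Rescale from roughly [-1, 1] to 0-255, as in download().
    pixels = ((np.asarray(values, dtype=np.float32) + 1) / 2 * 255).astype(np.uint8)
    print(label, pixels)  # 6 [ 23  49 155]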
