@@ -4,11 +4,19 @@
 This module contains the Dataset class for the USPS dataset with labels 0-6.
 """

+import bz2
+import hashlib
 from pathlib import Path
+from tempfile import TemporaryDirectory
+from urllib.request import urlretrieve

 import h5py as h5
 import numpy as np
+from PIL import Image
 from torch.utils.data import Dataset
+from torchvision import transforms
+
+from .datasources import USPS_SOURCE


 class USPSDataset0_6(Dataset):
@@ -28,7 +36,7 @@ class USPSDataset0_6(Dataset):

     Attributes
     ----------
-    path : pathlib.Path
+    filepath : pathlib.Path
         Path to the USPS dataset file.
     mode : str
         Mode of the dataset, either train or test.
@@ -63,6 +71,8 @@ class USPSDataset0_6(Dataset):
     6
     """

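+    # Both the train and test splits are cached in a single HDF5 file under data_path.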
+    filename = "usps.h5"
+
     def __init__(
         self,
         data_path: Path,
@@ -71,18 +81,97 @@ def __init__(
         download: bool = False,
     ):
         super().__init__()
-        self.path = data_path
+
+        path = data_path if isinstance(data_path, Path) else Path(data_path)
+        self.filepath = path / self.filename
         self.transform = transform
-        self.num_classes = 7
+        self.num_classes = 7  # 0-6
+        self.mode = "train" if train else "test"

-        if download:
-            raise NotImplementedError("Download functionality not implemented.")
+        # Download the dataset if a usable HDF5 file is not already present; the raw
+        # archive is fetched into a temporary directory so it is cleaned up automatically.
+        if download and not self._dataset_ok():
+            url, _, checksum = USPS_SOURCE[self.mode]
+
+            print(f"Downloading USPS dataset ({self.mode})...")
+            self.download(url, self.filepath, checksum, self.mode)

-        self.mode = "train" if train else "test"
         self.idx = self._index()

+    def _dataset_ok(self):
+        """Check if the dataset file exists and contains the required datasets."""
+
+        if not self.filepath.exists():
+            print(f"Dataset file {self.filepath} does not exist.")
+            return False
+
+        with h5.File(self.filepath, "r") as f:
+            for mode in ["train", "test"]:
+                if mode not in f:
+                    print(
+                        f"Dataset file {self.filepath} is missing the {mode} dataset."
+                    )
+                    return False
+
+        return True
+
+    def download(self, url, filepath, checksum, mode):
+        """Download the USPS dataset."""
+
+        def reporthook(blocknum, blocksize, totalsize):
+            """Report download progress."""
+            denom = 1024 * 1024
+            readsofar = blocknum * blocksize
+            if totalsize > 0:
+                percent = readsofar * 1e2 / totalsize
+                s = f"\r{int(percent):^3}% {readsofar / denom:.2f} of {totalsize / denom:.2f} MB"
+                print(s, end="", flush=True)
+                if readsofar >= totalsize:
+                    print()
+
+        # Download the dataset to a temporary file
+        with TemporaryDirectory() as tmpdir:
+            tmpdir = Path(tmpdir)
+            tmpfile = tmpdir / "usps.bz2"
+            urlretrieve(
+                url,
+                tmpfile,
+                reporthook=reporthook,
+            )
+
+            # Verify the integrity of the downloaded archive before converting it
+            if not self.check_integrity(tmpfile, checksum):
+                errmsg = (
+                    "The checksum of the downloaded file does "
+                    "not match the expected checksum."
+                )
+                raise ValueError(errmsg)
+
+            # Load the dataset and save it as an HDF5 file
+            with bz2.open(tmpfile) as fp:
+                raw = [line.decode().split() for line in fp.readlines()]
+
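+            # Each line is "<label> <index>:<value> ..."; keep only the pixel values
+            # and drop the feature indices.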
+            tmp = [[x.split(":")[-1] for x in data[1:]] for data in raw]
+
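+            # Pixel values come in as [-1, 1]; rescale to [0, 255] and cast to uint8.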
+            imgs = np.asarray(tmp, dtype=np.float32)
+            imgs = ((imgs + 1) / 2 * 255).astype(dtype=np.uint8)
+
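+            # Raw labels are 1-10; shift them down by one so the digits map to 0-9.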
+            targets = [int(d[0]) - 1 for d in raw]
+
+            with h5.File(self.filepath, "a") as f:
+                f.create_dataset(f"{mode}/data", data=imgs, dtype=np.float32)
+                f.create_dataset(f"{mode}/target", data=targets, dtype=np.int32)
+
+    @staticmethod
+    def check_integrity(filepath, checksum):
+        """Check the integrity of the USPS dataset file."""
+
+        file_hash = hashlib.md5(filepath.read_bytes()).hexdigest()
+
+        return checksum == file_hash
+
     def _index(self):
-        with h5.File(self.path, "r") as f:
+        with h5.File(self.filepath, "r") as f:
             labels = f[self.mode]["target"][:]

             # Get indices of samples with labels 0-6
@@ -92,8 +181,8 @@ def _index(self):
         return idx

     def _load_data(self, idx):
-        with h5.File(self.path, "r") as f:
-            data = f[self.mode]["data"][idx]
+        with h5.File(self.filepath, "r") as f:
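+            # Data is stored as float32 in the HDF5 file; cast back to uint8 for PIL.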
+            data = f[self.mode]["data"][idx].astype(np.uint8)
             label = f[self.mode]["target"][idx]

         return data, label
@@ -103,16 +192,33 @@ def __len__(self):

     def __getitem__(self, idx):
         data, target = self._load_data(self.idx[idx])
-
-        data = data.reshape(16, 16)
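+        # Rebuild the 16x16 grayscale image so torchvision transforms can be applied.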
+        data = Image.fromarray(data.reshape(16, 16), mode="L")

         # one hot encode the target
         target = np.eye(self.num_classes, dtype=np.float32)[target]

-        # Add channel dimension
-        data = np.expand_dims(data, axis=0)
-
         if self.transform:
             data = self.transform(data)

         return data, target
+
+
+if __name__ == "__main__":
+    # Example usage:
+    transform = transforms.Compose(
+        [
+            transforms.Resize((16, 16)),
+            transforms.ToTensor(),
+        ]
+    )
+
+    dataset = USPSDataset0_6(
+        data_path="data",
+        train=True,
+        download=False,
+        transform=transform,
+    )
+    print(len(dataset))
+    data, target = dataset[0]
+    print(data.shape)
+    print(target)