Merge branch 'johan/dataloader' into johan/micromacro

Johanmkr · Johanmkr · commit 8d6c07a2f6b3 · 2025-02-11T09:25:23.000+01:00
diff --git a/.python-version b/.python-version
@@ -0,0 +1 @@
+3.12
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,3 +1,27 @@
+[project]
+name = "collaborative-coding-exam"
+version = "0.1.0"
+description = "Exam project in the collaborative coding course."
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "black>=25.1.0",
+    "h5py>=3.12.1",
+    "isort>=6.0.0",
+    "jupyterlab>=4.3.5",
+    "numpy>=2.2.2",
+    "pandas>=2.2.3",
+    "pip>=25.0",
+    "pytest>=8.3.4",
+    "ruff>=0.9.4",
+    "scalene>=1.5.51",
+    "sphinx>=8.1.3",
+    "sphinx-autoapi>=3.4.0",
+    "sphinx-autobuild>=2024.10.3",
+    "sphinx-rtd-theme>=3.0.2",
+    "torch>=2.6.0",
+    "torchvision>=0.21.0",
+]
 [tool.isort]
 profile = "black"
 line_length = 88
diff --git a/utils/dataloaders/mnist_4_9.py b/utils/dataloaders/mnist_4_9.py
@@ -0,0 +1,75 @@
+import gzip
+import os
+import urllib.request as ur
+from pathlib import Path
+import numpy as np
+from torch.utils.data import Dataset
+
+class MNIST_4_9(Dataset):
+    """MNIST dataset wrapper that locates or downloads the raw IDX files.
+
+    Files live in ``<datapath>/MNIST``. Sample access is not implemented yet.
+
+    Args:
+        datapath: Directory under which the ``MNIST`` folder is kept.
+        train: Stored on the instance; not used by the code in this class yet.
+        download: Fetch missing files instead of raising.
+
+    Raises:
+        FileNotFoundError: Files are missing and ``download`` is False.
+    """
+
+    # Decompressed IDX files that must be present in ``mnist_path``.
+    _REQUIRED_FILES = (
+        "train-images-idx3-ubyte",
+        "train-labels-idx1-ubyte",
+        "t10k-images-idx3-ubyte",
+        "t10k-labels-idx1-ubyte",
+    )
+
+    def __init__(self, datapath: Path, train: bool = False, download: bool = False):
+        super().__init__()
+        # Accept plain strings too; the "/" joins below need a Path.
+        self.datapath = Path(datapath)
+        self.mnist_path = self.datapath / "MNIST"
+        self.train = train
+        self.download = download
+        # NOTE(review): presumably the six digits 4-9 -- confirm.
+        self.num_classes: int = 6
+
+        if not self._already_downloaded():
+            if not self.download:
+                raise FileNotFoundError(
+                    'Data files are not found. Set --download-data=True to download the data'
+                )
+            self._download()
+
+    def _download(self) -> None:
+        """Download and decompress any missing MNIST files into ``mnist_path``."""
+        urls: dict[str, str] = {
+            "train_images": "https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz",
+            "train_labels": "https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz",
+            "test_images": "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz",
+            "test_labels": "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz",
+        }
+        self.mnist_path.mkdir(parents=True, exist_ok=True)
+
+        for url in urls.values():
+            gz_path = self.mnist_path / url.rsplit('/', 1)[-1]
+            raw_path = gz_path.with_suffix('')  # strip the trailing ".gz"
+            if raw_path.exists():
+                print(f"File: {raw_path} already downloaded")
+                continue
+            print(f"File: {raw_path} is downloading...")
+            ur.urlretrieve(url, gz_path)
+            with gzip.open(gz_path, 'rb') as infile, open(raw_path, 'wb') as outfile:
+                outfile.write(infile.read())
+            # Delete the archive only after both handles are closed; removing
+            # an open file fails on Windows.
+            gz_path.unlink()
+
+    def _already_downloaded(self) -> bool:
+        """Return True when every required file exists in ``mnist_path``."""
+        return all((self.mnist_path / name).exists() for name in self._REQUIRED_FILES)
+
+    def __len__(self):
+        # TODO: sample loading is not implemented; fail loudly instead of
+        # returning None, which len() rejects with an opaque TypeError.
+        raise NotImplementedError("MNIST_4_9.__len__ is not implemented yet")
+
+    def __getitem__(self, index):
+        # The indexing protocol always passes an index, so accept one.
+        raise NotImplementedError("MNIST_4_9.__getitem__ is not implemented yet")
+    
+    
diff --git a/uv.lock b/uv.lock