11import functools
2+ import torch
23import torch .nn as nn
34
45# from ..head.build import HEAD_REGISTRY
@@ -10,16 +11,23 @@ def __init__(
1011 self ,
1112 in_features = 2048 ,
1213 hidden_layers = [],
14+ out_features = None ,
1315 activation = "relu" ,
1416 bn = True ,
1517 dropout = 0.0 ,
18+
1619 ):
1720 super ().__init__ ()
1821 if isinstance (hidden_layers , int ):
1922 hidden_layers = [hidden_layers ]
2023
2124 assert len (hidden_layers ) > 0
22- self .out_features = hidden_layers [- 1 ]
25+
26+ # If out_features is not specified, use the last hidden layer dimension
27+ if out_features is None :
28+ out_features = hidden_layers [- 1 ]
29+ self .out_features = out_features
30+ self .in_features = in_features
2331
2432 mlp = []
2533
@@ -33,15 +41,23 @@ def __init__(
3341 for hidden_dim in hidden_layers :
3442 mlp += [nn .Linear (in_features , hidden_dim )]
3543 if bn :
36- mlp += [nn .BatchNorm1d (hidden_dim )]
44+ mlp += [nn .LayerNorm (hidden_dim )]
3745 mlp += [act_fn ()]
3846 if dropout > 0 :
3947 mlp += [nn .Dropout (dropout )]
4048 in_features = hidden_dim
4149
50+ # Add final projection layer if output dimension differs from last hidden layer
51+ if out_features != hidden_layers [- 1 ]:
52+ mlp += [nn .Linear (hidden_layers [- 1 ], out_features )]
53+
4254 self .mlp = nn .Sequential (* mlp )
4355
def forward(self, x):
    """Run the input through the MLP.

    Args:
        x: Input tensor. Inputs with more than 2 dimensions are
           flattened to ``(batch, -1)`` before the MLP is applied;
           2-D inputs are passed through unchanged.

    Returns:
        Output tensor produced by ``self.mlp``.
    """
    if x.dim() > 2:
        # reshape (unlike view) also handles non-contiguous inputs,
        # e.g. tensors coming from a permute/transpose upstream,
        # where view would raise a RuntimeError.
        x = x.reshape(x.size(0), -1)
    return self.mlp(x)
4662
4763
0 commit comments