Commit 9cbaf31

fixed compiling issue
1 parent b809759 commit 9cbaf31

File tree

4 files changed: +42 -46 lines changed

README.md

Lines changed: 7 additions & 7 deletions
@@ -10,7 +10,7 @@ Supports local development, SageMaker training, flexible dataset prep, Weights &
 - Logging and experiment tracking (Weights & Biases)
 - Model checkpointing and flexible configuration
 - Ready for deployment (Gradio web app)
-- Mixed precision training (with `autocast` and `GradScaler`) for improved speed and memory efficiency on GPU
+- Gradient clipping, OneCycle LR policy, and mixed precision training (with `autocast` and `GradScaler`) for improved stability and GPU memory efficiency

 ## Project Structure
 ```graphql
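The new feature bullet above bundles gradient clipping, the OneCycle LR policy, and mixed precision. A minimal sketch of how these typically compose in one PyTorch training step (illustrative only, not the repo's actual loop; the model, loader, and `max_norm=1.0` are placeholder assumptions):

```python
import torch
from torch import nn, optim
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import OneCycleLR

def train_one_phase(model: nn.Module, loader, device: torch.device, epochs: int = 1):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=4e-3)
    scheduler = OneCycleLR(optimizer, max_lr=4e-3,
                           total_steps=epochs * len(loader),
                           pct_start=0.35, anneal_strategy="cos")
    scaler = GradScaler(enabled=(device.type == "cuda"))  # no-op on CPU
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad(set_to_none=True)
        with autocast(device_type=device.type, enabled=(device.type == "cuda")):
            loss = criterion(model(x), y)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # unscale so clipping sees true gradient norms
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)      # skips the step if gradients overflowed
        scaler.update()
        scheduler.step()            # OneCycle advances once per batch
```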
@@ -37,7 +37,7 @@ Supports local development, SageMaker training, flexible dataset prep, Weights &
 ## Quick Start
 ### 1. Clone & Install
 ```bash
-git clone https://github.com/<your-username>/food101-classifier.git
+git clone https://github.com/codinglabsong/food101-end2end-classifier-sagemaker-gradio.git
 cd food101-classifier
 pip install -r requirements.txt
 ```
@@ -90,8 +90,8 @@ Edit `.env` using `.env.example` as a guide for AWS and wandb keys.
 > The preprocessing pipeline (image resizing, cropping, normalization) **must be identical** between training and inference (including Gradio app or deployment).
 >
 > - All transforms should use parameters from `config/prod.yaml` (or your config file).
-> - The value of `img_size` used for training and inference must always be ≤ 256, since images are first resized so their short edge is 256 before center cropping.
-> - **Do not set `img_size` greater than 256.** This would result in errors or ineffective cropping during inference.
+> - The value of `img_size` used for training and inference must always be ≤ 512, since images are first resized so their short edge is 512 before center cropping.
+> - **Do not set `img_size` greater than 512.** This would result in errors or ineffective cropping during inference.

 **Best practice:**
 Update only your config file (not hardcoded values) when changing image size or normalization, and always reload configs in both training and inference code.
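This guidance is the crux of the commit's preprocessing fix. A minimal sketch of the config-driven pattern it describes, assuming the `estimator.hyperparameters` layout of `config/prod.yaml` shown below; note that the code in this same commit resizes the short edge to 256 (see the gradio_app.py and train.py hunks), so the crop size must stay at or below that resize edge:

```python
import yaml
from torchvision import transforms

def build_eval_transform(cfg_path: str = "config/prod.yaml") -> transforms.Compose:
    """Build the inference-side transform from the shared config file."""
    with open(cfg_path) as f:
        cfg = yaml.safe_load(f)
    hp = cfg["estimator"]["hyperparameters"]
    img_size = hp.get("img-size", hp.get("img_size"))  # key renamed to img-size in this commit
    assert img_size <= 256, "img_size must not exceed the Resize short edge"
    return transforms.Compose([
        transforms.Resize(256),           # short edge -> 256, same as train.py's test_tfms
        transforms.CenterCrop(img_size),  # must match the training-time crop
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225]),
    ])
```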
@@ -112,8 +112,8 @@ This project includes an interactive Gradio app for making predictions with the

 ## Requirements
 - See `requirements.txt`
-- Python 3.8
-- PyTorch >= 2.2
+- Python >= 3.9
+- PyTorch >= 2.6

 ## Contributing
 Open to issues and pull requests!
@@ -128,4 +128,4 @@ This project is licensed under the MIT License.

 ## Tips:
 - .env.example helps keep secrets out of git.
-- .gitignore: Don't track datasets, outputs, or .env.
+- .gitignore: Don't track datasets, outputs, wandb, or .env.

config/prod.yaml

Lines changed: 12 additions & 12 deletions
@@ -1,19 +1,19 @@
 estimator:
   hyperparameters:
     seed: 42
-    batch_size: 128
-    num_epochs_phase1: 10
-    num_epochs_phase2: 8
-    lr_head: 4e-3
-    lr_backbone: 4e-4
+    batch-size: 128
+    num-epochs-phase1: 8
+    num-epochs-phase2: 10
+    lr-head: 4e-3
+    lr-backbone: 5e-4
     patience: 3
-    num_workers: 3
-    img_size: 224
+    num-workers: 2
+    img-size: 224
   instance_count: 1
-  instance_type: "ml.m5.xlarge"
-  framework_version: "2.2.0"
-  py_version: "py310"
-  base_job_name: "mnist-cnn"
+  instance_type: "ml.g4dn.xlarge"
+  framework_version: "2.6.0"
+  py_version: "py312"
+  base_job_name: "food101-classifier"
   use_spot_instances: true
-  max_run: 7200 # seconds
+  max_run: 10800 # seconds
   max_wait: 14400 # seconds, needed when using spot instances or otherwise wait indefinitely
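For context, these estimator fields map onto the SageMaker Python SDK roughly as follows; a hedged sketch, not the repo's actual launcher, with placeholder entry point, source dir, IAM role, and S3 URI:

```python
import yaml
from sagemaker.pytorch import PyTorch

with open("config/prod.yaml") as f:
    cfg = yaml.safe_load(f)["estimator"]

estimator = PyTorch(
    entry_point="train.py",                      # assumed entry point
    source_dir="src",                            # assumed layout
    role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder IAM role
    instance_count=cfg["instance_count"],
    instance_type=cfg["instance_type"],          # ml.g4dn.xlarge carries a T4 GPU
    framework_version=cfg["framework_version"],  # matches PyTorch >= 2.6 in the README
    py_version=cfg["py_version"],
    base_job_name=cfg["base_job_name"],
    hyperparameters=cfg["hyperparameters"],      # passed to train.py as --key value flags
    use_spot_instances=cfg["use_spot_instances"],
    max_run=cfg["max_run"],
    max_wait=cfg["max_wait"],                    # required when using spot instances
)
estimator.fit({"training": "s3://your-bucket/food101"})  # placeholder S3 URI
```

The switch from ml.m5.xlarge (CPU-only) to ml.g4dn.xlarge (NVIDIA T4, compute capability 7.5) is also what makes the capability-gated `torch.compile` path in train.py reachable.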

gradio_app.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ def build_model(num_classes):

 # 4. Preprocessing: same as test transforms in train.py
 preprocess = transforms.Compose([
-    transforms.Resize(512),
+    transforms.Resize(256),
     transforms.CenterCrop(cfg["estimator"]["hyperparameters"]["img_size"]),
     transforms.ToTensor(),
     transforms.Normalize([0.485,0.456,0.406],
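This one-line change matters because `transforms.Resize` with a single int scales the short edge to that value while keeping aspect ratio. A standalone check of the old versus new pipeline (the input image size here is illustrative):

```python
from PIL import Image
from torchvision import transforms

img = Image.new("RGB", (640, 480))  # (width, height)

old = transforms.Resize(512)(img)   # short edge 480 -> 512: an upscale
new = transforms.Resize(256)(img)   # short edge 480 -> 256: a downscale
print(old.size, new.size)           # (682, 512) (341, 256)

# CenterCrop(224) keeps ~44% of a 512 short edge but ~88% of a 256 one,
# so the 256 resize preserves far more of the scene in the final crop
# and now matches the "short edge=256" comment in train.py's test_tfms.
print(transforms.CenterCrop(224)(new).size)  # (224, 224)
```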

src/train.py

Lines changed: 22 additions & 26 deletions
@@ -15,14 +15,14 @@ def parse_args():

     # hyperparameters sent by the client (same flag names as estimator hyperparameters)
     p.add_argument("--seed", type=int, default=42)
-    p.add_argument("--batch-size", type=int, default=512)
-    p.add_argument("--num-epochs-phase1", type=int, default=2)
-    p.add_argument("--num-epochs-phase2", type=int, default=2)
-    p.add_argument("--lr-head", type=float, default=16e-3)
-    p.add_argument("--lr-backbone", type=float, default=16e-4)
+    p.add_argument("--batch-size", type=int, default=128)
+    p.add_argument("--num-epochs-phase1", type=int, default=10)
+    p.add_argument("--num-epochs-phase2", type=int, default=8)
+    p.add_argument("--lr-head", type=float, default=4e-3)
+    p.add_argument("--lr-backbone", type=float, default=4e-4)
     p.add_argument("--patience", type=int, default=3)
     p.add_argument("--num-workers", type=int, default=2)
-    p.add_argument("--img-size", type=int, default=384)
+    p.add_argument("--img-size", type=int, default=224)

     # other variables
     p.add_argument("--wandb-project", type=str, default="food101-classifier")
@@ -119,7 +119,7 @@ def main():
                              [0.229,0.224,0.225])
     ])
     test_tfms = transforms.Compose([
-        transforms.Resize(512), # shrink so short edge=256
+        transforms.Resize(256), # shrink so short edge=256
         transforms.CenterCrop(cfg.img_size), # take middle window
         transforms.ToTensor(),
         transforms.Normalize([0.485,0.456,0.406],
@@ -151,7 +151,7 @@
     test_dl = DataLoader(test_ds, batch_size=cfg.batch_size, num_workers=cfg.num_workers, pin_memory=True)

     print(f"Data ready. len(train)={len(train_ds)}, len(val)={len(val_ds)}, len(test)={len(test_ds)}")
-
+
     # ---------- Model Training Preparation ----------
     # create the model
     def build_model(num_classes: int) -> nn.Module:
@@ -175,14 +175,6 @@ def build_model(num_classes: int) -> nn.Module:
     print(f"number of class labels: {len(class_names)}")
     model = build_model(len(class_names))

-    # try compile if supported:
-    if DEVICE.type == "cuda" and torch.cuda.is_available():
-        cap = torch.cuda.get_device_properties(DEVICE).major
-        if cap >= 7:
-            model = torch.compile(model)
-        else:
-            print(f"GPU CC {cap}.x detected - skipping torch.compile()")
-
     criterion = nn.CrossEntropyLoss() # standard multi-class loss

     # one epoch function
@@ -192,7 +184,6 @@ def build_model(num_classes: int) -> nn.Module:
     else:
         scaler = None

-    step_counters = {'train': 0, 'val': 0}
     def epoch_loop (phase: str,
                     model: nn.Module,
                     loader: DataLoader,
@@ -240,12 +231,11 @@ def epoch_loop (phase: str,
                 run_correct += (outputs.argmax(1) == y).sum().item()
                 imgs_processed += batch_size # add to throughput counter

-                # wandb: batch logging (train & val only)
-                if phase in ["train", "val"]:
+                # wandb: batch logging (train only)
+                if is_train:
                     wandb.log({
-                        f"{phase}/batch_loss": loss.item(),
-                    }, step=step_counters[phase])
-                    step_counters[phase] += 1
+                        "train/batch_loss": loss.item(),
+                    })

             if torch.cuda.is_available():
                 torch.cuda.synchronize() # CPU waits until GPU finishes. More accurate dt.
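A plausible reading of this change and the `step_counters` removal above: when `wandb.log` is called without `step`, wandb assigns a single auto-incrementing global step, whereas hand-maintained per-phase counters can move the step backwards once train and val logging interleave, and wandb warns on or drops non-monotonic steps. A minimal sketch of the simplified pattern, using offline mode so it runs without an account:

```python
import wandb

run = wandb.init(project="food101-classifier", mode="offline")
for batch_idx in range(3):
    loss = 1.0 / (batch_idx + 1)           # stand-in for a real training loss
    wandb.log({"train/batch_loss": loss})  # global step auto-increments: 0, 1, 2
wandb.log({"train/epoch_acc": 0.71})       # lands on the next global step
run.finish()
```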
@@ -278,7 +268,7 @@
                 f"{phase}/loss_scale": loss_scale,
                 f"{phase}/peak_mem_MB": peak_mem_MB,
             })
-        wandb.log(metrics, step=step_counters[phase] - 1) # ensures logging at the same step as the last batch of that epoch
+        wandb.log(metrics) # wandb assigns its auto-incremented global step
         return epoch_loss, epoch_acc

     # checkpoint helper
@@ -302,7 +292,7 @@ def save_ckpt(state: Dict, filename: str, model_dir: str) -> None:
         optimizer,
         max_lr=cfg.lr_head,
         total_steps=total_steps,
-        pct_start=0.2, # 20% of total steps for LR warm-up
+        pct_start=0.35, # 35% of total steps for LR warm-up
         anneal_strategy="cos", # cosine annealing down
     )
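In OneCycleLR, `pct_start` is the fraction of `total_steps` spent ramping up to `max_lr` before annealing down, so this raises phase 1's warm-up from 20% to 35% of the run (the next hunk shortens phase 2's to 15%). A standalone check of where the peak lands, with a toy model and step count:

```python
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import OneCycleLR

model = nn.Linear(4, 2)  # toy model
opt = optim.Adam(model.parameters(), lr=4e-3)
sched = OneCycleLR(opt, max_lr=4e-3, total_steps=100,
                   pct_start=0.35, anneal_strategy="cos")

lrs = []
for _ in range(100):
    opt.step()    # optimizer must step before the scheduler
    sched.step()
    lrs.append(sched.get_last_lr()[0])
print(lrs.index(max(lrs)) + 1)  # peak reached near step 35 (= 0.35 * 100)
```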

@@ -335,17 +325,23 @@
     print("\nPhase 2: fine-tune")

     # unfreeze backbone
+    print("\nUnfreezing backbone...")
     for p in model.parameters():
         p.requires_grad = True
+
+    if torch.cuda.is_available() and torch.cuda.get_device_properties(DEVICE).major >= 7:
+        torch.cuda.empty_cache() # free the memory (helpful, but optional)
+        model = torch.compile(model)
+        print(f"GPU CC {torch.cuda.get_device_properties(DEVICE).major}.x detected - compiled model")

-    optimizer = optim.Adam(model.parameters(), lr=cfg.lr_backbone)
+    optimizer = optim.Adam(model.parameters(), lr=cfg.lr_backbone)
     total_steps = cfg.num_epochs_phase2 * n_steps_per_epoch

     scheduler = OneCycleLR(
         optimizer,
         max_lr=cfg.lr_backbone,
         total_steps=total_steps,
-        pct_start=0.2,
+        pct_start=0.15,
         anneal_strategy="cos",
     )
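This block restores the capability-gated `torch.compile` that the earlier hunk removed, but runs it in phase 2 only, after every parameter is unfrozen. A likely rationale (my reading, not stated in the commit message) is that flipping `requires_grad` on an already-compiled model invalidates its compiled graphs and forces recompilation, so compiling once, after the final parameter state is set, avoids that cost. Factored as a reusable helper, the guard might look like:

```python
import torch

def maybe_compile(model: torch.nn.Module, device: torch.device) -> torch.nn.Module:
    """Compile only on CUDA devices new enough to benefit (compute capability >= 7.0)."""
    if device.type == "cuda" and torch.cuda.is_available():
        cap = torch.cuda.get_device_properties(device).major
        if cap >= 7:
            torch.cuda.empty_cache()  # optional: release cached blocks before compiling
            return torch.compile(model)
        print(f"GPU CC {cap}.x detected - skipping torch.compile()")
    return model
```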
