Commit f6974eb

Change num_workers for imagenet, add validation tests for step times
1 parent 9c93fc2

7 files changed: +212 −289 lines changed

algoperf/workloads/cifar/cifar_pytorch/workload.py

Lines changed: 2 additions & 2 deletions
@@ -110,12 +110,12 @@ def _build_dataset(
       batch_size=ds_iter_batch_size,
       shuffle=not USE_PYTORCH_DDP and is_train,
       sampler=sampler,
-      num_workers=4 if is_train else self.eval_num_workers,
+      num_workers=2 * N_GPUS if is_train else self.eval_num_workers,
       pin_memory=True,
       drop_last=is_train,
     )
-    dataloader = data_utils.PrefetchedWrapper(dataloader, DEVICE)
     dataloader = data_utils.cycle(dataloader, custom_sampler=USE_PYTORCH_DDP)
+    dataloader = data_utils.dataloader_iterator_wrapper(dataloader, DEVICE)
     return dataloader
 
   def init_model_fn(self, rng: spec.RandomState) -> spec.ModelInitState:
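
Note on the CIFAR change: besides scaling `num_workers` with GPU count, the commit swaps `data_utils.PrefetchedWrapper` for `data_utils.dataloader_iterator_wrapper`, applied after `cycle` rather than before it. That wrapper is repo-internal and not shown in this commit; the following is only a hypothetical sketch of what a device-transfer wrapper of this kind typically does, not the repo's implementation:

```python
import torch

def dataloader_iterator_wrapper(dataloader, device):
  """Hypothetical sketch (not the repo's code): yield batches on `device`."""
  for batch in dataloader:
    # Move each tensor in the batch to the target device; non_blocking
    # lets the copy overlap compute when pin_memory=True was used.
    yield tuple(
        t.to(device, non_blocking=True) if torch.is_tensor(t) else t
        for t in batch)
```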

algoperf/workloads/imagenet_resnet/imagenet_pytorch/workload.py

Lines changed: 2 additions & 2 deletions
@@ -254,10 +254,11 @@ def _build_dataset(
       batch_size=ds_iter_batch_size,
       shuffle=not USE_PYTORCH_DDP and is_train,
       sampler=sampler,
-      num_workers=4 if is_train else self.eval_num_workers,
+      num_workers=5 * N_GPUS if is_train else self.eval_num_workers,
       pin_memory=True,
       drop_last=is_train,
       persistent_workers=is_train,
+      prefetch_factor=N_GPUS,
     )
     dataloader = data_utils.PrefetchedWrapper(dataloader, DEVICE)
     dataloader = data_utils.cycle(
@@ -266,7 +267,6 @@ def _build_dataset(
       use_mixup=use_mixup,
       mixup_alpha=0.2,
     )
-
     return dataloader
 
   def init_model_fn(self, rng: spec.RandomState) -> spec.ModelInitState:
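
For intuition on the new ImageNet settings: with `num_workers = w` and `prefetch_factor = p`, a PyTorch `DataLoader` keeps up to `w * p` batches queued ahead of the training loop, so both knobs deepen the host-side pipeline as GPU count grows. A self-contained sketch (the toy dataset and sizes are illustrative, not from the repo):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

N_GPUS = max(1, torch.cuda.device_count())  # assumption: mirrors the repo's N_GPUS

# Toy stand-in for the ImageNet dataset.
dataset = TensorDataset(
    torch.randn(1024, 3, 224, 224), torch.randint(0, 1000, (1024,)))

loader = DataLoader(
    dataset,
    batch_size=256,
    num_workers=5 * N_GPUS,   # heuristic from the diff: five workers per GPU
    prefetch_factor=N_GPUS,   # batches each worker loads ahead of time
    persistent_workers=True,  # keep workers alive between epochs
    pin_memory=True,          # page-locked buffers for fast host-to-device copies
    drop_last=True,
)
```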

algoperf/workloads/imagenet_vit/imagenet_pytorch/models.py

Lines changed: 7 additions & 7 deletions
@@ -5,7 +5,6 @@
 and https://github.com/lucidrains/vit-pytorch.
 """
 
-import math
 from typing import Any, Optional, Tuple, Union
 
 import torch
@@ -126,13 +125,14 @@ def forward(self, x: spec.Tensor, dropout_rate: float) -> spec.Tensor:
     value_layer = self.transpose_for_scores(self.value(x))
     query_layer = self.transpose_for_scores(mixed_query_layer)
 
-    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
-    attention_scores = attention_scores / math.sqrt(self.head_dim)
-
-    attention_probs = F.softmax(attention_scores, dim=-1)
-    attention_probs = F.dropout(attention_probs, dropout_rate, self.training)
+    # Use built-in scaled_dot_product_attention (Flash Attention when available)
+    context_layer = F.scaled_dot_product_attention(
+        query_layer,
+        key_layer,
+        value_layer,
+        dropout_p=dropout_rate if self.training else 0.0,
+    )
 
-    context_layer = torch.matmul(attention_probs, value_layer)
     context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
     new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_dim,)
     context_layer = context_layer.view(new_context_layer_shape)
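
The fused call is numerically equivalent to the removed manual path: `F.scaled_dot_product_attention` defaults to a `1/sqrt(head_dim)` scale, matching the deleted division. A quick standalone check, independent of the repo:

```python
import math
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim) -- shapes chosen arbitrarily for the check.
q, k, v = (torch.randn(2, 8, 16, 64) for _ in range(3))

# Manual path, as removed by the diff (eval mode, so no dropout).
scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(q.size(-1))
manual = torch.matmul(F.softmax(scores, dim=-1), v)

# Fused path, as added by the diff; default scale is 1/sqrt(head_dim).
fused = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0)

print(torch.allclose(manual, fused, atol=1e-5))  # expected: True
```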

algorithms/baselines/external_tuning/pytorch_nadamw_full_budget.py

Lines changed: 0 additions & 1 deletion
@@ -5,7 +5,6 @@
 
 import torch
 import torch.distributed.nn as dist_nn
-from absl import logging
 from torch import Tensor
 from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR, SequentialLR
 

benchmark_step_times.py

Lines changed: 0 additions & 274 deletions
This file was deleted.

submission_runner.py

Lines changed: 2 additions & 3 deletions
@@ -256,7 +256,6 @@ def train_once(
     'librispeech_conformer',
     'ogbg',
     'criteo1tb',
-    'imagenet_vit',
     'librispeech_deepspeech',
   ]
   eager_backend_workloads = []
@@ -266,6 +265,7 @@ def train_once(
     'librispeech_deepspeech',
     'ogbg',
     'wmt',
+    'imagenet_vit',
   ]
   base_workload = workloads.get_base_workload_name(workload_name)
   if base_workload in compile_error_workloads:
@@ -411,9 +411,8 @@ def train_once(
       train_step_end_time = get_time()
       if global_step == 11:
         step_10_end_time = train_step_end_time
-
+
       # Log step time every 100 steps
-      # Note: global_step was incremented, so use (global_step - 1) to match
       if (global_step - 1) % 100 == 0 and workload.metrics_logger is not None:
         if step_10_end_time is not None and global_step > 11:
           elapsed_time_ms = (train_step_end_time - step_10_end_time) * 1000.0
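
The timing block above excludes the first 10 steps as warm-up (compilation, caches) before averaging. A hypothetical standalone sketch of the same pattern; the sleep stands in for a real training step, and the names mirror the diff but nothing here is repo code:

```python
import time

step_10_end_time = None
global_step = 0

while global_step < 1000:
  time.sleep(0.001)  # stand-in for training step number `global_step`
  global_step += 1   # incremented before the timing block, as in the diff
  train_step_end_time = time.monotonic()

  if global_step == 11:  # step 10 just finished: end of the warm-up window
    step_10_end_time = train_step_end_time

  # Log every 100 steps; (global_step - 1) is the step that just ran.
  if (global_step - 1) % 100 == 0 and step_10_end_time is not None and global_step > 11:
    elapsed_time_ms = (train_step_end_time - step_10_end_time) * 1000.0
    mean_ms = elapsed_time_ms / (global_step - 11)  # steps timed since warm-up
    print(f'step {global_step - 1}: ~{mean_ms:.2f} ms/step')
```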
