
Commit 6796d69

Merge branch 'NVIDIA:master' into fix_nnunet_lowres_axis
2 parents: 507351d + 54e2fb4

File tree (7 files changed: +26 / -47 lines)

MxNet/Classification/RN50v1.5/dali.py
MxNet/Classification/RN50v1.5/fit.py
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/fused_gather_embedding.py
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/sparse_embedding.py
PyTorch/Recommendation/DLRM/dlrm/scripts/main.py
PyTorch/Recommendation/NCF/README.md
PyTorch/Recommendation/NCF/ncf.py

MxNet/Classification/RN50v1.5/dali.py

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ def add_dali_args(parser):
     group.add_argument('--dali-validation-threads', type=int, default=10, help="number of threads" +\
                        "per GPU for DALI for validation")
     group.add_argument('--dali-prefetch-queue', type=int, default=5, help="DALI prefetch queue depth")
-    group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=256, help="Memory padding value for nvJPEG (in MB)")
+    group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, help="Memory padding value for nvJPEG (in MB)")
     group.add_argument('--dali-fuse-decoder', type=int, default=1, help="0 or 1 whether to fuse decoder or not")

     group.add_argument('--dali-nvjpeg-width-hint', type=int, default=5980, help="Width hint value for nvJPEG (in pixels)")

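The only change here is the default nvJPEG memory padding dropping from 256 MB to 64 MB. For context, a flag like this is normally forwarded to DALI's image decoder as a padding hint, which DALI takes in bytes. A minimal sketch of that wiring, assuming DALI's `fn.decoders.image` API; the pipeline below is illustrative and not the repository's `dali.py`:

```python
# Hypothetical pipeline showing where a --dali-nvjpeg-memory-padding value (in MB)
# usually ends up: as the decoder's memory padding hints, which DALI expects in bytes.
from nvidia.dali import pipeline_def
import nvidia.dali.fn as fn
import nvidia.dali.types as types

@pipeline_def(batch_size=128, num_threads=4, device_id=0)
def train_pipeline(data_dir, nvjpeg_padding_mb=64):
    jpegs, labels = fn.readers.file(file_root=data_dir, random_shuffle=True)
    images = fn.decoders.image(
        jpegs,
        device="mixed",                                          # CPU parse + GPU (nvJPEG) decode
        device_memory_padding=nvjpeg_padding_mb * 1024 * 1024,   # scratch reserved up front
        host_memory_padding=nvjpeg_padding_mb * 1024 * 1024,
        output_type=types.RGB,
    )
    images = fn.resize(images, resize_shorter=256)
    return images, labels
```

A smaller default reserves less decoder scratch memory per pipeline, at the cost of a possible reallocation if an unusually large JPEG is encountered.
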
MxNet/Classification/RN50v1.5/fit.py

Lines changed: 5 additions & 5 deletions

@@ -483,11 +483,6 @@ def fit(args, model, data_loader):
     # select gpu for horovod process
     if 'horovod' in args.kv_store:
         args.gpus = [args.gpus[hvd.local_rank()]]
-        ctx = mx.gpu(hvd.local_rank())
-
-        tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
-        tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
-        tensor1, tensor2 = hvd.grouped_allreduce([tensor1,tensor2])

     if args.amp:
         amp.init()

@@ -579,6 +574,11 @@ def fit(args, model, data_loader):
         params = model.collect_params()
         if params is not None:
             hvd.broadcast_parameters(params, root_rank=0)
+        ctx = mx.gpu(hvd.local_rank())
+        tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor1, tensor2 = hvd.grouped_allreduce([tensor1,tensor2])
+
     global_metrics = CompositeMeter()
     if args.mode in ['train_val', 'train']:
         global_metrics.register_metric('train.loss', MinMeter())

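The fit.py change relocates the one-off `grouped_allreduce` warm-up so it runs after `hvd.broadcast_parameters` instead of right after GPU selection. A minimal sketch of that ordering with Horovod's MXNet bindings; the toy Gluon model and shapes are assumptions, not the repository's `fit` function:

```python
# Sketch of the setup order the diff converges on:
# init -> pin GPU -> broadcast parameters -> warm up the grouped allreduce path.
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
ctx = mx.gpu(hvd.local_rank())                 # one GPU per Horovod worker

net = mx.gluon.nn.Dense(10)                    # stand-in for the real model
net.initialize(ctx=ctx)

params = net.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)   # everyone starts from rank 0's weights

# A tiny grouped allreduce issued once as a warm-up, mirroring the lines the diff moves.
tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
tensor1, tensor2 = hvd.grouped_allreduce([tensor1, tensor2])
```
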
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/fused_gather_embedding.py

Lines changed: 4 additions & 2 deletions

@@ -17,7 +17,7 @@
 """
 
 from absl import logging
-from apex import amp
+import torch
 from torch.autograd import Function
 
 from dlrm.cuda_ext import fused_embedding

@@ -26,12 +26,14 @@
 class BuckleEmbeddingFusedGatherFunction(Function):
     """Customized embedding gather """
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, embedding, indices, offsets, amp_train):
         output = fused_embedding.gather_gpu_fused_fwd(embedding, indices, offsets, amp_train)
         ctx.save_for_backward(embedding, indices, offsets)
         return output
 
     @staticmethod
+    @torch.cuda.amp.custom_bwd
     def backward(ctx, grad_output):
         embedding, indices, offsets = ctx.saved_tensors
 

@@ -40,4 +42,4 @@ def backward(ctx, grad_output):
         return grad_weights, None, None, None
 
 
-buckle_embedding_fused_gather = amp.float_function(BuckleEmbeddingFusedGatherFunction.apply)
+buckle_embedding_fused_gather = BuckleEmbeddingFusedGatherFunction.apply

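Replacing `apex.amp.float_function` with the `torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)` / `torch.cuda.amp.custom_bwd` pair is PyTorch's documented way to keep a custom autograd `Function` in FP32 under autocast; the same pattern is applied in `sparse_embedding.py` below. A self-contained sketch of the idea on a toy elementwise op (the DLRM version wraps a fused CUDA kernel instead):

```python
import torch
from torch.autograd import Function

class ScaleByTwo(Function):
    """Toy op forced to run in FP32 even inside an autocast region."""

    @staticmethod
    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, x):
        # Floating-point CUDA inputs arrive here cast to FP32 when autocast is active.
        return x * 2.0

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, grad_output):
        # Executes with the same autocast state as the forward pass.
        return grad_output * 2.0

scale_by_two = ScaleByTwo.apply

if torch.cuda.is_available():
    x = torch.randn(8, device="cuda", requires_grad=True)
    with torch.cuda.amp.autocast():
        y = scale_by_two(x)        # stays FP32, like the fused embedding gather
    y.sum().backward()
```

With the cast handled at the `Function` boundary, the module-level `amp.float_function` wrapper becomes unnecessary, which is why the diff exports `BuckleEmbeddingFusedGatherFunction.apply` directly.
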
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/sparse_embedding.py

Lines changed: 4 additions & 4 deletions

@@ -15,7 +15,7 @@
 import copy
 
 import torch
-from apex import amp
+from torch.cuda import amp
 from dlrm.cuda_ext import sparse_gather
 from torch import nn
 from torch.autograd import Function

@@ -24,18 +24,18 @@
 class EmbeddingGatherFunction(Function):
     """Customized embedding gather with fused plain SGD"""
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, embedding, indices):
         output = sparse_gather.gather_gpu_fwd(embedding, indices)
         ctx.save_for_backward(indices)
         ctx.num_features = embedding.size(0)
         return output
 
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def backward(ctx, grad_output):
         indices = ctx.saved_tensors[0]
-
         grad_embedding = sparse_gather.gather_gpu_bwd(grad_output, indices, ctx.num_features)
-
         return grad_embedding, None
 
 

@@ -66,4 +66,4 @@ def forward(self, categorical_inputs):
         return embedding_out
 
 
-embedding_gather = amp.float_function(EmbeddingGatherFunction.apply)
+embedding_gather = EmbeddingGatherFunction.apply

PyTorch/Recommendation/DLRM/dlrm/scripts/main.py

Lines changed: 2 additions & 5 deletions

@@ -17,7 +17,7 @@
 import os
 import sys
 from absl import app, flags, logging
-from apex import amp, parallel, optimizers as apex_optim
+from apex import optimizers as apex_optim
 
 from dlrm.data.feature_spec import FeatureSpec
 from dlrm.model.distributed import DistributedDlrm

@@ -500,10 +500,7 @@ def parallelize(model):
         if world_size <= 1:
             return model
 
-        if use_gpu:
-            model.top_model = parallel.DistributedDataParallel(model.top_model)
-        else: # Use other backend for CPU
-            model.top_model = torch.nn.parallel.DistributedDataParallel(model.top_model)
+        model.top_model = torch.nn.parallel.DistributedDataParallel(model.top_model)
         return model
 
     if FLAGS.mode == 'test':

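With `apex.parallel` gone, both the GPU and CPU branches collapse onto `torch.nn.parallel.DistributedDataParallel`. A minimal sketch of the native setup, assuming a `torchrun`-style launcher; the helper name and toy model are illustrative, not the DLRM entry point:

```python
import os
import torch
import torch.distributed as dist

def wrap_distributed(model: torch.nn.Module) -> torch.nn.Module:
    """Wrap a module in torch.nn.parallel.DistributedDataParallel when launched
    with more than one process (e.g. via torchrun)."""
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    if world_size <= 1:
        return model

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl")     # NCCL backend for GPU training
    torch.cuda.set_device(local_rank)

    model = model.cuda()
    return torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

if __name__ == "__main__":
    ddp_model = wrap_distributed(torch.nn.Linear(16, 1))
```
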
PyTorch/Recommendation/NCF/README.md

Lines changed: 1 addition & 13 deletions

@@ -143,23 +143,11 @@ The ability to train deep learning networks with lower precision was introduced
 For information about:
 - How to train using mixed precision, refer to the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
 - Techniques used for mixed precision training, refer to the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
-- APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
 
 
 #### Enabling mixed precision
 
-Using the Automatic Mixed Precision (AMP) package requires two modifications in the source code.
-The first one is to initialize the model and the optimizer using the `amp.initialize` function:
-```python
-model, optimizer = amp.initialize(model, optimizer, opt_level="O2"
-                                  keep_batchnorm_fp32=False, loss_scale='dynamic')
-```
-
-The second one is to use the AMP's loss scaling context manager:
-```python
-with amp.scale_loss(loss, optimizer) as scaled_loss:
-    scaled_loss.backward()
-```
+Mixed precision training is turned off by default. To turn it on issue the `--amp` flag to the `main.py` script.
 
 #### Enabling TF32

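The README now points users at a single `--amp` switch; in the updated code (see the `ncf.py` diff below) that flag just toggles `torch.cuda.amp.autocast` and a `GradScaler`. A minimal sketch of the same wiring on a toy model; the argument parsing and model here are illustrative, not NCF itself:

```python
import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--amp', action='store_true', help='enable automatic mixed precision')
args = parser.parse_args()

model = torch.nn.Linear(128, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.BCEWithLogitsLoss()

# Both helpers become no-ops when enabled=False, so one loop serves FP32 and AMP runs.
scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

for _ in range(10):
    inputs = torch.randn(64, 128, device='cuda')
    labels = torch.randint(0, 2, (64, 1), device='cuda').float()

    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=args.amp):
        outputs = model(inputs)
        loss = criterion(outputs, labels)

    scaler.scale(loss).backward()   # scale the loss to avoid FP16 gradient underflow
    scaler.step(optimizer)          # unscales gradients, then calls optimizer.step()
    scaler.update()
```
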
PyTorch/Recommendation/NCF/ncf.py

Lines changed: 9 additions & 17 deletions

@@ -47,9 +47,6 @@
 
 import dllogger
 
-from apex.parallel import DistributedDataParallel as DDP
-from apex import amp
-
 
 def synchronized_timestamp():
     torch.cuda.synchronize()

@@ -252,12 +249,8 @@ def main():
     model = model.cuda()
     criterion = criterion.cuda()
 
-    if args.amp:
-        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
-                                          keep_batchnorm_fp32=False, loss_scale='dynamic')
-
     if args.distributed:
-        model = DDP(model)
+        model = torch.nn.parallel.DistributedDataParallel(model)
 
     local_batch = args.batch_size // args.world_size
     traced_criterion = torch.jit.trace(criterion.forward,

@@ -291,6 +284,7 @@ def main():
     best_epoch = 0
     best_model_timestamp = synchronized_timestamp()
     train_throughputs, eval_throughputs = [], []
+    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)
 
     for epoch in range(args.epochs):
 

@@ -311,16 +305,14 @@ def main():
                 label_features = batch_dict[LABEL_CHANNEL_NAME]
                 label_batch = label_features[label_feature_name]
 
-                outputs = model(user_batch, item_batch)
-                loss = traced_criterion(outputs, label_batch.view(-1, 1)).float()
-                loss = torch.mean(loss.view(-1), 0)
+                with torch.cuda.amp.autocast(enabled=args.amp):
+                    outputs = model(user_batch, item_batch)
+                    loss = traced_criterion(outputs, label_batch.view(-1, 1))
+                    loss = torch.mean(loss.float().view(-1), 0)
 
-                if args.amp:
-                    with amp.scale_loss(loss, optimizer) as scaled_loss:
-                        scaled_loss.backward()
-                else:
-                    loss.backward()
-                optimizer.step()
+                scaler.scale(loss).backward()
+                scaler.step(optimizer)
+                scaler.update()
 
                 for p in model.parameters():
                     p.grad = None