
Commit 6796d69

Merge branch 'NVIDIA:master' into fix_nnunet_lowres_axis
2 parents: 507351d + 54e2fb4

File tree (7 files changed: +26 / -47 lines)

MxNet/Classification/RN50v1.5/dali.py
MxNet/Classification/RN50v1.5/fit.py
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/fused_gather_embedding.py
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/sparse_embedding.py
PyTorch/Recommendation/DLRM/dlrm/scripts/main.py
PyTorch/Recommendation/NCF/README.md
PyTorch/Recommendation/NCF/ncf.py

MxNet/Classification/RN50v1.5/dali.py

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ def add_dali_args(parser):
     group.add_argument('--dali-validation-threads', type=int, default=10, help="number of threads" +\
                        "per GPU for DALI for validation")
     group.add_argument('--dali-prefetch-queue', type=int, default=5, help="DALI prefetch queue depth")
-    group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=256, help="Memory padding value for nvJPEG (in MB)")
+    group.add_argument('--dali-nvjpeg-memory-padding', type=int, default=64, help="Memory padding value for nvJPEG (in MB)")
     group.add_argument('--dali-fuse-decoder', type=int, default=1, help="0 or 1 whether to fuse decoder or not")

     group.add_argument('--dali-nvjpeg-width-hint', type=int, default=5980, help="Width hint value for nvJPEG (in pixels)")

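The only change here is the default nvJPEG memory padding dropping from 256 MB to 64 MB. For context, a flag like this is normally forwarded to DALI's image decoder as a padding hint, which DALI takes in bytes. A minimal sketch of that wiring, assuming DALI's `fn.decoders.image` API; the pipeline below is illustrative and not the repository's `dali.py`:

```python
# Hypothetical pipeline showing where a --dali-nvjpeg-memory-padding value (in MB)
# usually ends up: as the decoder's memory padding hints, which DALI expects in bytes.
from nvidia.dali import pipeline_def
import nvidia.dali.fn as fn
import nvidia.dali.types as types

@pipeline_def(batch_size=128, num_threads=4, device_id=0)
def train_pipeline(data_dir, nvjpeg_padding_mb=64):
    jpegs, labels = fn.readers.file(file_root=data_dir, random_shuffle=True)
    images = fn.decoders.image(
        jpegs,
        device="mixed",                                          # CPU parse + GPU (nvJPEG) decode
        device_memory_padding=nvjpeg_padding_mb * 1024 * 1024,   # scratch reserved up front
        host_memory_padding=nvjpeg_padding_mb * 1024 * 1024,
        output_type=types.RGB,
    )
    images = fn.resize(images, resize_shorter=256)
    return images, labels
```

A smaller default reserves less decoder scratch memory per pipeline, at the cost of a possible reallocation if an unusually large JPEG is encountered.
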
MxNet/Classification/RN50v1.5/fit.py

Lines changed: 5 additions & 5 deletions

@@ -483,11 +483,6 @@ def fit(args, model, data_loader):
     # select gpu for horovod process
     if 'horovod' in args.kv_store:
         args.gpus = [args.gpus[hvd.local_rank()]]
-        ctx = mx.gpu(hvd.local_rank())
-
-        tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
-        tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
-        tensor1, tensor2 = hvd.grouped_allreduce([tensor1,tensor2])

     if args.amp:
         amp.init()

@@ -579,6 +574,11 @@ def fit(args, model, data_loader):
         params = model.collect_params()
         if params is not None:
             hvd.broadcast_parameters(params, root_rank=0)
+        ctx = mx.gpu(hvd.local_rank())
+        tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
+        tensor1, tensor2 = hvd.grouped_allreduce([tensor1,tensor2])
+
     global_metrics = CompositeMeter()
     if args.mode in ['train_val', 'train']:
         global_metrics.register_metric('train.loss', MinMeter())

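The fit.py change relocates the one-off `grouped_allreduce` warm-up so it runs after `hvd.broadcast_parameters` instead of right after GPU selection. A minimal sketch of that ordering with Horovod's MXNet bindings; the toy Gluon model and shapes are assumptions, not the repository's `fit` function:

```python
# Sketch of the setup order the diff converges on:
# init -> pin GPU -> broadcast parameters -> warm up the grouped allreduce path.
import mxnet as mx
import horovod.mxnet as hvd

hvd.init()
ctx = mx.gpu(hvd.local_rank())                 # one GPU per Horovod worker

net = mx.gluon.nn.Dense(10)                    # stand-in for the real model
net.initialize(ctx=ctx)

params = net.collect_params()
if params is not None:
    hvd.broadcast_parameters(params, root_rank=0)   # everyone starts from rank 0's weights

# A tiny grouped allreduce issued once as a warm-up, mirroring the lines the diff moves.
tensor1 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
tensor2 = mx.nd.zeros(shape=(1,), dtype='float32', ctx=ctx)
tensor1, tensor2 = hvd.grouped_allreduce([tensor1, tensor2])
```
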
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/fused_gather_embedding.py

Lines changed: 4 additions & 2 deletions

@@ -17,7 +17,7 @@
 """
 
 from absl import logging
-from apex import amp
+import torch
 from torch.autograd import Function
 
 from dlrm.cuda_ext import fused_embedding

@@ -26,12 +26,14 @@
 class BuckleEmbeddingFusedGatherFunction(Function):
     """Customized embedding gather """
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, embedding, indices, offsets, amp_train):
         output = fused_embedding.gather_gpu_fused_fwd(embedding, indices, offsets, amp_train)
         ctx.save_for_backward(embedding, indices, offsets)
         return output
 
     @staticmethod
+    @torch.cuda.amp.custom_bwd
     def backward(ctx, grad_output):
         embedding, indices, offsets = ctx.saved_tensors
 

@@ -40,4 +42,4 @@ def backward(ctx, grad_output):
         return grad_weights, None, None, None
 
 
-buckle_embedding_fused_gather = amp.float_function(BuckleEmbeddingFusedGatherFunction.apply)
+buckle_embedding_fused_gather = BuckleEmbeddingFusedGatherFunction.apply

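Replacing `apex.amp.float_function` with the `torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)` / `torch.cuda.amp.custom_bwd` pair is PyTorch's documented way to keep a custom autograd `Function` in FP32 under autocast; the same pattern is applied in `sparse_embedding.py` below. A self-contained sketch of the idea on a toy elementwise op (the DLRM version wraps a fused CUDA kernel instead):

```python
import torch
from torch.autograd import Function

class ScaleByTwo(Function):
    """Toy op forced to run in FP32 even inside an autocast region."""

    @staticmethod
    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
    def forward(ctx, x):
        # Floating-point CUDA inputs arrive here cast to FP32 when autocast is active.
        return x * 2.0

    @staticmethod
    @torch.cuda.amp.custom_bwd
    def backward(ctx, grad_output):
        # Executes with the same autocast state as the forward pass.
        return grad_output * 2.0

scale_by_two = ScaleByTwo.apply

if torch.cuda.is_available():
    x = torch.randn(8, device="cuda", requires_grad=True)
    with torch.cuda.amp.autocast():
        y = scale_by_two(x)        # stays FP32, like the fused embedding gather
    y.sum().backward()
```

With the cast handled at the `Function` boundary, the module-level `amp.float_function` wrapper becomes unnecessary, which is why the diff exports `BuckleEmbeddingFusedGatherFunction.apply` directly.
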
PyTorch/Recommendation/DLRM/dlrm/cuda_ext/sparse_embedding.py

Lines changed: 4 additions & 4 deletions

@@ -15,7 +15,7 @@
 import copy
 
 import torch
-from apex import amp
+from torch.cuda import amp
 from dlrm.cuda_ext import sparse_gather
 from torch import nn
 from torch.autograd import Function

@@ -24,18 +24,18 @@
 class EmbeddingGatherFunction(Function):
     """Customized embedding gather with fused plain SGD"""
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def forward(ctx, embedding, indices):
         output = sparse_gather.gather_gpu_fwd(embedding, indices)
         ctx.save_for_backward(indices)
         ctx.num_features = embedding.size(0)
         return output
 
     @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
     def backward(ctx, grad_output):
         indices = ctx.saved_tensors[0]
-
         grad_embedding = sparse_gather.gather_gpu_bwd(grad_output, indices, ctx.num_features)
-
         return grad_embedding, None
 
 

@@ -66,4 +66,4 @@ def forward(self, categorical_inputs):
         return embedding_out
 
 
-embedding_gather = amp.float_function(EmbeddingGatherFunction.apply)
+embedding_gather = EmbeddingGatherFunction.apply

PyTorch/Recommendation/DLRM/dlrm/scripts/main.py

Lines changed: 2 additions & 5 deletions

@@ -17,7 +17,7 @@
 import os
 import sys
 from absl import app, flags, logging
-from apex import amp, parallel, optimizers as apex_optim
+from apex import optimizers as apex_optim
 
 from dlrm.data.feature_spec import FeatureSpec
 from dlrm.model.distributed import DistributedDlrm

@@ -500,10 +500,7 @@ def parallelize(model):
         if world_size <= 1:
             return model
 
-        if use_gpu:
-            model.top_model = parallel.DistributedDataParallel(model.top_model)
-        else: # Use other backend for CPU
-            model.top_model = torch.nn.parallel.DistributedDataParallel(model.top_model)
+        model.top_model = torch.nn.parallel.DistributedDataParallel(model.top_model)
         return model
 
     if FLAGS.mode == 'test':

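With `apex.parallel` gone, both the GPU and CPU branches collapse onto `torch.nn.parallel.DistributedDataParallel`. A minimal sketch of the native setup, assuming a `torchrun`-style launcher; the helper name and toy model are illustrative, not the DLRM entry point:

```python
import os
import torch
import torch.distributed as dist

def wrap_distributed(model: torch.nn.Module) -> torch.nn.Module:
    """Wrap a module in torch.nn.parallel.DistributedDataParallel when launched
    with more than one process (e.g. via torchrun)."""
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    if world_size <= 1:
        return model

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    if not dist.is_initialized():
        dist.init_process_group(backend="nccl")     # NCCL backend for GPU training
    torch.cuda.set_device(local_rank)

    model = model.cuda()
    return torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

if __name__ == "__main__":
    ddp_model = wrap_distributed(torch.nn.Linear(16, 1))
```
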
PyTorch/Recommendation/NCF/README.md

Lines changed: 1 addition & 13 deletions

@@ -143,23 +143,11 @@ The ability to train deep learning networks with lower precision was introduced
 For information about:
 - How to train using mixed precision, refer to the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html) documentation.
 - Techniques used for mixed precision training, refer to the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
-- APEX tools for mixed precision training, refer to the [NVIDIA Apex: Tools for Easy Mixed-Precision Training in PyTorch](https://devblogs.nvidia.com/apex-pytorch-easy-mixed-precision-training/).
 
 
 #### Enabling mixed precision
 
-Using the Automatic Mixed Precision (AMP) package requires two modifications in the source code.
-The first one is to initialize the model and the optimizer using the `amp.initialize` function:
-```python
-model, optimizer = amp.initialize(model, optimizer, opt_level="O2"
-                                  keep_batchnorm_fp32=False, loss_scale='dynamic')
-```
-
-The second one is to use the AMP's loss scaling context manager:
-```python
-with amp.scale_loss(loss, optimizer) as scaled_loss:
-    scaled_loss.backward()
-```
+Mixed precision training is turned off by default. To turn it on issue the `--amp` flag to the `main.py` script.
 
 #### Enabling TF32

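The README now points users at a single `--amp` switch; in the updated code (see the `ncf.py` diff below) that flag just toggles `torch.cuda.amp.autocast` and a `GradScaler`. A minimal sketch of the same wiring on a toy model; the argument parsing and model here are illustrative, not NCF itself:

```python
import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--amp', action='store_true', help='enable automatic mixed precision')
args = parser.parse_args()

model = torch.nn.Linear(128, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.BCEWithLogitsLoss()

# Both helpers become no-ops when enabled=False, so one loop serves FP32 and AMP runs.
scaler = torch.cuda.amp.GradScaler(enabled=args.amp)

for _ in range(10):
    inputs = torch.randn(64, 128, device='cuda')
    labels = torch.randint(0, 2, (64, 1), device='cuda').float()

    optimizer.zero_grad()
    with torch.cuda.amp.autocast(enabled=args.amp):
        outputs = model(inputs)
        loss = criterion(outputs, labels)

    scaler.scale(loss).backward()   # scale the loss to avoid FP16 gradient underflow
    scaler.step(optimizer)          # unscales gradients, then calls optimizer.step()
    scaler.update()
```
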
PyTorch/Recommendation/NCF/ncf.py

Lines changed: 9 additions & 17 deletions

@@ -47,9 +47,6 @@
 
 import dllogger
 
-from apex.parallel import DistributedDataParallel as DDP
-from apex import amp
-
 
 def synchronized_timestamp():
     torch.cuda.synchronize()

@@ -252,12 +249,8 @@ def main():
     model = model.cuda()
     criterion = criterion.cuda()
 
-    if args.amp:
-        model, optimizer = amp.initialize(model, optimizer, opt_level="O2",
-                                          keep_batchnorm_fp32=False, loss_scale='dynamic')
-
     if args.distributed:
-        model = DDP(model)
+        model = torch.nn.parallel.DistributedDataParallel(model)
 
     local_batch = args.batch_size // args.world_size
     traced_criterion = torch.jit.trace(criterion.forward,

@@ -291,6 +284,7 @@ def main():
     best_epoch = 0
     best_model_timestamp = synchronized_timestamp()
     train_throughputs, eval_throughputs = [], []
+    scaler = torch.cuda.amp.GradScaler(enabled=args.amp)
 
     for epoch in range(args.epochs):
 

@@ -311,16 +305,14 @@ def main():
                 label_features = batch_dict[LABEL_CHANNEL_NAME]
                 label_batch = label_features[label_feature_name]
 
-                outputs = model(user_batch, item_batch)
-                loss = traced_criterion(outputs, label_batch.view(-1, 1)).float()
-                loss = torch.mean(loss.view(-1), 0)
+                with torch.cuda.amp.autocast(enabled=args.amp):
+                    outputs = model(user_batch, item_batch)
+                    loss = traced_criterion(outputs, label_batch.view(-1, 1))
+                    loss = torch.mean(loss.float().view(-1), 0)
 
-                if args.amp:
-                    with amp.scale_loss(loss, optimizer) as scaled_loss:
-                        scaled_loss.backward()
-                else:
-                    loss.backward()
-                optimizer.step()
+                scaler.scale(loss).backward()
+                scaler.step(optimizer)
+                scaler.update()
 
                 for p in model.parameters():
                     p.grad = None