
Commit 533f744

[TXL/PyT] Fixed issue with AMP training together with gradient accumulation (#720)
1 parent 8cbac00 commit 533f744
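
The commit addresses a case where gradient accumulation (several backward passes per optimizer step) interacted badly with automatic mixed precision: with Apex AMP, gradients must stay in their scaled form across chunks and only be unscaled once, just before the optimizer step. The native torch.cuda.amp path used elsewhere in this training script already behaves that way, because unscaling happens inside scaler.step(). A minimal sketch of that pattern under gradient accumulation (the loop and names below are illustrative, not the repository's training loop):

import torch

scaler = torch.cuda.amp.GradScaler()

def accumulation_step(model, optimizer, chunks, criterion):
    optimizer.zero_grad()
    for data, target in chunks:
        with torch.cuda.amp.autocast():
            loss = criterion(model(data), target) / len(chunks)
        # Backward on the scaled loss; gradients stay scaled across chunks.
        scaler.scale(loss).backward()
    # Unscaling happens exactly once, inside scaler.step(), before the update.
    scaler.step(optimizer)
    scaler.update()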

File tree

5 files changed: +20 −8 lines changed

  PyTorch/LanguageModeling/Transformer-XL/.gitignore
  PyTorch/LanguageModeling/Transformer-XL/pytorch/data_utils.py
  PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py
  PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_base.yaml
  PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_large.yaml

PyTorch/LanguageModeling/Transformer-XL/.gitignore

Lines changed: 1 addition & 1 deletion
@@ -2,6 +2,6 @@
 __pycache__/
 data/
 results/
+pytorch/LM-TFM/*
 *.out
 *.log
-*.json

PyTorch/LanguageModeling/Transformer-XL/pytorch/data_utils.py

Lines changed: 3 additions & 3 deletions
@@ -46,7 +46,7 @@ def __init__(self, data, bsz, bptt, device='cpu', mem_len=None, ext_len=None, wa
         data = data[:n_step * bsz]

         # Evenly divide the data across the bsz batches.
-        self.data = data.view(bsz, -1).t().contiguous()
+        self.data = data.view(bsz, -1).t().contiguous().pin_memory()

         if mem_len and warmup:
             self.warmup_batches = (mem_len + bptt - 1) // bptt
@@ -83,8 +83,8 @@ def get_batch(self, i, bptt=None):
         end_idx = i + seq_len
         beg_idx = max(0, i - self.ext_len)

-        data = self.data[beg_idx:end_idx].to(self.device)
-        target = self.data[i+1:i+1+seq_len].to(self.device)
+        data = self.data[beg_idx:end_idx].to(self.device, non_blocking=True)
+        target = self.data[i+1:i+1+seq_len].to(self.device, non_blocking=True)

         if self.mem_len and self.warmup:
             warm = i >= self.warmup_elems
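
The data_utils.py change pins the full token tensor in page-locked host memory and switches the per-batch copies to non-blocking transfers, so the host-to-device copy of a chunk can overlap with GPU work. A standalone sketch of that pattern, with an illustrative function name and signature rather than the repository's API:

import torch

def make_batches(data: torch.Tensor, bsz: int, bptt: int, device: torch.device):
    n_step = data.size(0) // bsz
    data = data[:n_step * bsz]
    # Pinned (page-locked) host memory enables asynchronous DMA transfers.
    data = data.view(bsz, -1).t().contiguous().pin_memory()
    for i in range(0, data.size(0) - 1, bptt):
        seq_len = min(bptt, data.size(0) - 1 - i)
        # non_blocking=True only overlaps the copy when the source tensor is pinned.
        x = data[i:i + seq_len].to(device, non_blocking=True)
        y = data[i + 1:i + 1 + seq_len].to(device, non_blocking=True)
        yield x, y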

PyTorch/LanguageModeling/Transformer-XL/pytorch/train.py

Lines changed: 4 additions & 4 deletions
@@ -436,7 +436,7 @@ def evaluate(eval_iter, model, args):


 def train_iteration(model, i, mems, data_chunks, target_chunks, scaler,
-                    optimizer, device, args):
+                    optimizer, device, delay_unscale, args):
     cpu = torch.device('cpu')
     data_i = data_chunks[i].contiguous()
     target_i = target_chunks[i].contiguous()
@@ -456,7 +456,7 @@ def train_iteration(model, i, mems, data_chunks, target_chunks, scaler,
     if args.amp == 'pytorch':
         scaler.scale(loss).backward()
     elif args.amp == 'apex':
-        with amp.scale_loss(loss, optimizer) as scaled_loss:
+        with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss:
             scaled_loss.backward()
     else:
         loss.backward()
@@ -498,12 +498,12 @@ def train(tr_iter, va_iter, model, para_model, model_config, optimizer,
                 with para_model.no_sync():
                     train_loss_chunk = train_iteration(
                         para_model, i, mems, data_chunks, target_chunks, scaler,
-                        optimizer, device, args
+                        optimizer, device, True, args
                     )
             else:
                 train_loss_chunk = train_iteration(
                     para_model, i, mems, data_chunks, target_chunks, scaler,
-                    optimizer, device, args
+                    optimizer, device, False, args
                 )

             train_loss += train_loss_chunk
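
The train.py change threads a delay_unscale flag into train_iteration: every gradient-accumulation chunk except the last passes True, telling the Apex loss scaler not to unscale the accumulated gradients yet, while the final chunk passes False so gradients are unscaled once before the optimizer step. A rough sketch of the same pattern, assuming NVIDIA Apex is installed and the model/optimizer were already wrapped with amp.initialize (function and variable names are illustrative):

from apex import amp  # assumes NVIDIA Apex is installed

def accumulate_and_step(model, optimizer, chunks, criterion, accum_steps):
    optimizer.zero_grad()
    for i, (data, target) in enumerate(chunks):
        loss = criterion(model(data), target) / accum_steps
        # Only the last chunk lets Apex unscale gradients and update the loss scale;
        # earlier chunks just accumulate scaled gradients.
        last = (i == accum_steps - 1)
        with amp.scale_loss(loss, optimizer, delay_unscale=not last) as scaled_loss:
            scaled_loss.backward()
    optimizer.step()

With delay_unscale left at its default of False on every chunk, Apex would try to unscale after each partial backward pass, which is presumably the failure mode behind issue #720 when --batch_chunk > 1.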

PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_base.yaml

Lines changed: 6 additions & 0 deletions
@@ -44,6 +44,12 @@ default:
   eval:
     <<: *eval

+manual_eval:
+  train:
+    <<: *train
+  eval:
+    <<: *eval
+    manual_config: '{"n_token": 267735, "n_layer": 16, "n_head": 8, "d_model": 512, "d_head": 64, "d_inner": 2048, "dropout": 0.1, "dropatt": 0.0, "dtype": null, "tie_weight": true, "d_embed": 512, "div_val": 1, "tie_projs": [false, true, true, true], "pre_lnorm": false, "tgt_len": 192, "ext_len": 0, "mem_len": 192, "cutoffs": [19997, 39997, 199997], "same_length": false, "attn_type": 0, "clamp_len": -1, "sample_softmax": -1}'

 # Full training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
 dgx1_8gpu_fp16: &dgx1_8gpu_fp16
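
The new manual_eval section reuses the shared train/eval anchors and adds a manual_config entry: a JSON string spelling out the full Transformer-XL architecture so evaluation can rebuild the model from explicit hyperparameters. A hedged sketch of how such a value could be consumed; the dictionary path (manual_config nested under eval) and the use of PyYAML are assumptions based on the snippet above, not the repository's own loader:

import json
import yaml  # PyYAML

with open('wt103_base.yaml') as f:
    cfg = yaml.safe_load(f)  # resolves the *train / *eval anchors and <<: merges

# Pull the architecture dict out of the manual_eval section shown in the diff.
manual_config = json.loads(cfg['manual_eval']['eval']['manual_config'])
print(manual_config['n_layer'], manual_config['d_model'])  # 16 512 for the base config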

PyTorch/LanguageModeling/Transformer-XL/pytorch/wt103_large.yaml

Lines changed: 6 additions & 0 deletions
@@ -55,6 +55,12 @@ default:
   eval:
     <<: *eval

+manual_eval:
+  train:
+    <<: *train
+  eval:
+    <<: *eval
+    manual_config: '{"n_token": 267735, "n_layer": 18, "n_head": 16, "d_model": 1024, "d_head": 64, "d_inner": 4096, "dropout": 0.2, "dropatt": 0.2, "dtype": null, "tie_weight": true, "d_embed": 1024, "div_val": 4, "tie_projs": [false, true, true, true], "pre_lnorm": false, "tgt_len": 384, "ext_len": 0, "mem_len": 384, "cutoffs": [19997, 39997, 199997], "same_length": false, "attn_type": 0, "clamp_len": -1, "sample_softmax": -1}'

 # Full training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
 dgx1_8gpu_fp16: &dgx1_8gpu_fp16
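
Both YAML files lean on anchors and merge keys: `<<: *eval` copies every key from the mapping anchored as eval into the new manual_eval section, so the added blocks only need to state what differs (here, manual_config). A tiny self-contained illustration of that mechanism; the keys batch_size and split are made up, not taken from the configs:

import yaml

doc = """
eval: &eval
  batch_size: 16
  split: test

manual_eval:
  eval:
    <<: *eval                           # inherit every key from the anchored mapping
    manual_config: '{"n_layer": 16}'    # then add or override entries
"""

cfg = yaml.safe_load(doc)
print(cfg['manual_eval']['eval'])
# -> {'batch_size': 16, 'split': 'test', 'manual_config': '{"n_layer": 16}'}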

0 commit comments

Comments
 (0)