
Commit 28d16ff

Add AdamW optimizer support for World Language Model example (#1380)
1 parent 993a98a commit 28d16ff

File tree

2 files changed (+20, -6 lines)


word_language_model/README.md

Lines changed: 5 additions & 2 deletions
@@ -8,9 +8,11 @@ python main.py --accel --epochs 6           # Train a LSTM on Wikitext-2.
 python main.py --accel --epochs 6 --tied    # Train a tied LSTM on Wikitext-2.
 python main.py --accel --tied               # Train a tied LSTM on Wikitext-2 for 40 epochs.
 python main.py --accel --epochs 6 --model Transformer --lr 5
-                                            # Train a Transformer model on Wikitext-2.
+                                            # Train a Transformer model on Wikitext-2.
+python main.py --accel --epochs 6 --model Transformer --use-optimizer --lr 0.001
+                                            # Train a Transformer model with AdamW optimizer on Wikitext-2.

-python generate.py --accel                  # Generate samples from the default model checkpoint.
+python generate.py --accel                  # Generate samples from the default model checkpoint.
 ```

 > [!NOTE]

@@ -45,6 +47,7 @@ optional arguments:
                         path to export the final model in onnx format
   --nhead NHEAD         the number of heads in the encoder/decoder of the transformer model
   --dry-run             verify the code and the model
+  --use-optimizer       specify whether to use an AdamW optimizer
 ```

 With these arguments, a variety of models can be tested.
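For context on the learning rates in the commands above: with `--use-optimizer`, the example hands the `--lr` value directly to `torch.optim.AdamW` (see the `main.py` diff below), which is why an Adam-range value such as 0.001 replaces the 5 used with the manual update. A minimal sketch, with a placeholder `nn.Linear` model standing in for the example's networks:

```python
import torch
import torch.nn as nn

# Placeholder model for illustration; the example builds an RNN or Transformer from model.py.
model = nn.Linear(8, 8)

# --use-optimizer --lr 0.001  ->  the --lr value becomes AdamW's learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
print(optimizer.defaults["lr"])  # 0.001
```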

word_language_model/main.py

Lines changed: 15 additions & 4 deletions
@@ -47,7 +47,10 @@
                     help='the number of heads in the encoder/decoder of the transformer model')
 parser.add_argument('--dry-run', action='store_true',
                     help='verify the code and the model')
-parser.add_argument('--accel', action='store_true',help='Enables accelerated training')
+parser.add_argument('--accel', action='store_true',
+                    help='Enables accelerated training')
+parser.add_argument('--use-optimizer', action='store_true',
+                    help='Uses AdamW optimizer for gradient updating')
 args = parser.parse_args()

 # Set the random seed manually for reproducibility.

@@ -104,6 +107,8 @@ def batchify(data, bsz):
 model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device)

 criterion = nn.NLLLoss()
+if args.use_optimizer:
+    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)

 ###############################################################################
 # Training code

@@ -167,7 +172,10 @@ def train():
         data, targets = get_batch(train_data, i)
         # Starting each batch, we detach the hidden state from how it was previously produced.
         # If we didn't, the model would try backpropagating all the way to start of the dataset.
-        model.zero_grad()
+        if args.use_optimizer:
+            optimizer.zero_grad()
+        else:
+            model.zero_grad()
         if args.model == 'Transformer':
             output = model(data)
             output = output.view(-1, ntokens)

@@ -179,8 +187,11 @@ def train():

         # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
         torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
-        for p in model.parameters():
-            p.data.add_(p.grad, alpha=-lr)
+        if args.use_optimizer:
+            optimizer.step()
+        else:
+            for p in model.parameters():
+                p.data.add_(p.grad, alpha=-lr)

         total_loss += loss.item()
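Taken together, the diff keeps the example's original manual SGD-style update and switches to AdamW only when the flag is set. Below is a minimal, self-contained sketch of the resulting training step; the `nn.Linear` model, random tensors, and the `use_optimizer`/`lr`/`clip` variables are placeholders standing in for the example's `RNNModel`, Wikitext-2 batches, and `args` values.

```python
import torch
import torch.nn as nn

# Placeholder pieces for illustration; the real example uses RNNModel/TransformerModel
# and Wikitext-2 batches rather than nn.Linear and random tensors.
model = nn.Linear(16, 16)
data, targets = torch.randn(32, 16), torch.randn(32, 16)
use_optimizer, lr, clip = True, 0.001, 0.25  # stand-ins for args.use_optimizer, args.lr, args.clip

# Mirrors the commit: construct AdamW only when the flag is set.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr) if use_optimizer else None

for step in range(3):
    # Clear gradients through the optimizer when it exists, otherwise on the model.
    if use_optimizer:
        optimizer.zero_grad()
    else:
        model.zero_grad()

    loss = nn.functional.mse_loss(model(data), targets)
    loss.backward()

    # Gradient clipping is kept on both paths, as in the example.
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    # AdamW step, or the original manual SGD-style parameter update.
    if use_optimizer:
        optimizer.step()
    else:
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)
```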
