
Commit a4f9948

WMT mixed-precision support
1 parent 6f7d638 commit a4f9948

5 files changed: 72 additions, 23 deletions

5 files changed

+72
-23
lines changed

algoperf/workloads/wmt/wmt_jax/models.py

Lines changed: 3 additions & 0 deletions
@@ -364,6 +364,7 @@ def __call__(
       input_embed = nn.Embed(
         num_embeddings=cfg.vocab_size,
         features=cfg.emb_dim,
+        dtype=cfg.dtype,
         embedding_init=nn.initializers.normal(stddev=1.0),
       )
     else:
@@ -437,6 +438,7 @@ def __call__(
       output_embed = nn.Embed(
         num_embeddings=cfg.vocab_size,
         features=cfg.emb_dim,
+        dtype=cfg.dtype,
         embedding_init=nn.initializers.normal(stddev=1.0),
       )
     else:
@@ -497,6 +499,7 @@ def setup(self):
       self.shared_embedding = nn.Embed(
         num_embeddings=cfg.vocab_size,
         features=cfg.emb_dim,
+        dtype=cfg.dtype,
         embedding_init=nn.initializers.normal(stddev=1.0),
       )
     else:
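For context, a minimal standalone Flax sketch (not taken from this repo) of what the `dtype` argument to `nn.Embed` does: the embedding table itself stays in `param_dtype` (float32 by default), while `dtype` sets the dtype the lookup output is promoted to.

# Minimal Flax sketch, independent of algoperf: `dtype` controls the dtype of
# the embedding lookup output; the stored table keeps its `param_dtype`.
import jax
import jax.numpy as jnp
from flax import linen as nn


class TinyEmbed(nn.Module):
  @nn.compact
  def __call__(self, ids):
    return nn.Embed(num_embeddings=8, features=4, dtype=jnp.bfloat16)(ids)


variables = TinyEmbed().init(jax.random.PRNGKey(0), jnp.array([1, 2]))
out = TinyEmbed().apply(variables, jnp.array([1, 2]))
print(jax.tree_util.tree_map(lambda x: x.dtype, variables))  # embedding table: float32
print(out.dtype)  # bfloat16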

algoperf/workloads/wmt/wmt_jax/workload.py

Lines changed: 19 additions & 0 deletions
@@ -6,6 +6,7 @@
 
 import jax
 import jax.numpy as jnp
+import jmp
 import numpy as np
 import optax
 from absl import logging
@@ -27,6 +28,17 @@ def _to_host(x: spec.Tensor) -> spec.Tensor:
 class WmtWorkload(BaseWmtWorkload):
   """WMT Jax workload."""
 
+  def __init__(self) -> None:
+    super().__init__()
+    compute_dtype = spec.JAX_DTYPE_MAP[self._compute_dtype]
+    param_dtype = spec.JAX_DTYPE_MAP[self._param_dtype]
+    output_dtype = compute_dtype
+    self._mp_policy = jmp.Policy(
+      compute_dtype=compute_dtype,
+      param_dtype=param_dtype,
+      output_dtype=output_dtype,
+    )
+
   def compute_weighted_cross_entropy(
     self,
     logits: spec.Tensor,
@@ -251,11 +263,13 @@ def init_model_fn(self, rng: spec.RandomState) -> spec.ModelInitState:
     else:
       raise ValueError(f'Unknown activation function {self.activation}.')
 
+    param_dtype = spec.JAX_DTYPE_MAP[self._param_dtype]
     model_config = models.TransformerConfig(
       pre_ln=self.pre_ln,
       attention_temp=self.attention_temp,
       activation=activation,
       glu=self.glu,
+      dtype=param_dtype,
     )
     self._train_model = models.Transformer(model_config)
     eval_config = replace(model_config, deterministic=True)
@@ -313,6 +327,9 @@ def model_fn(
     else:
       model = self._eval_model
 
+    # Cast params to compute dtype
+    params = self._mp_policy.cast_to_compute(params)
+
     logits_batch = model.apply(
       {'params': params},
       inputs,
@@ -324,6 +341,8 @@
       rngs={'dropout': rng},
       dropout_rate=dropout_rate,
     )
+    # Cast logits to output dtype
+    logits_batch = self._mp_policy.cast_to_output(logits_batch)
     return logits_batch, None
 
   def _build_input_queue(
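As a rough illustration of the `jmp` policy used above (a standalone sketch, not this workload's code): parameters are kept in `param_dtype`, cast to `compute_dtype` for the forward pass, and the result is cast to `output_dtype`.

# Standalone jmp sketch (not this workload): float32 params, bfloat16 compute.
import jax.numpy as jnp
import jmp

policy = jmp.Policy(
  param_dtype=jnp.float32,
  compute_dtype=jnp.bfloat16,
  output_dtype=jnp.bfloat16,
)

params = {'w': jnp.ones((4, 4))}  # stored in float32
compute_params = policy.cast_to_compute(params)  # bfloat16 copies of the leaves
logits = compute_params['w'] @ jnp.ones((4,), jnp.bfloat16)
logits = policy.cast_to_output(logits)  # no-op here since output_dtype == compute_dtype
print(compute_params['w'].dtype, logits.dtype)  # bfloat16 bfloat16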

algoperf/workloads/wmt/wmt_pytorch/models.py

Lines changed: 24 additions & 6 deletions
@@ -116,10 +116,11 @@ def __init__(
     layer_norm_eps: float = 1e-6,
     attention_temp: float = 1.0,
     pre_ln: bool = True,
+    dtype: torch.dtype = torch.float32,
   ):
     super().__init__()
-    self.pos_encoder = PositionalEncoding(d_model)
-    self.shared_embedding = nn.Embedding(ntoken, d_model)
+    self.pos_encoder = PositionalEncoding(d_model, dtype=dtype)
+    self.shared_embedding = nn.Embedding(ntoken, d_model, dtype=dtype)
     self.encoder = Encoder(
       d_model,
       nhead,
@@ -130,6 +131,7 @@ def __init__(
       layer_norm_eps,
       attention_temp,
       pre_ln,
+      dtype=dtype,
     )
     self.decoder = Decoder(
       d_model,
@@ -141,6 +143,7 @@
       layer_norm_eps,
       attention_temp,
       pre_ln,
+      dtype=dtype,
     )
     # Share positional encoding and embedding between encoder and decoder.
     self.encoder.pos_encoder = self.pos_encoder
@@ -287,6 +290,7 @@ def __init__(
     layer_norm_eps: float = 1e-6,
     attention_temp: float = 1.0,
     pre_ln: bool = True,
+    dtype: torch.dtype = torch.float32,
   ):
     super().__init__()
     self.nhead = nhead
@@ -301,8 +305,11 @@ def __init__(
       layer_norm_eps=layer_norm_eps,
       attention_temp=attention_temp,
       pre_ln=pre_ln,
+      dtype=dtype,
+    )
+    encoder_norm = (
+      nn.LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) if pre_ln else None
     )
-    encoder_norm = nn.LayerNorm(d_model, eps=layer_norm_eps) if pre_ln else None
     self.encoder = TransformerEncoder(encoder_layer, nlayers, encoder_norm)
 
   def forward(
@@ -332,6 +339,7 @@ def __init__(
     layer_norm_eps: float = 1e-6,
     attention_temp: float = 1.0,
     pre_ln: bool = True,
+    dtype: torch.dtype = torch.float32,
   ):
     super().__init__()
     self.nhead = nhead
@@ -347,6 +355,7 @@
       nlayers,
       attention_temp,
       pre_ln,
+      dtype=dtype,
     )
 
   def forward(
@@ -398,13 +407,18 @@ def forward(
 
 
 class PositionalEncoding(nn.Module):
-  def __init__(self, d_model: int, max_len: int = 256):
+  def __init__(
+    self,
+    d_model: int,
+    max_len: int = 256,
+    dtype: torch.dtype = torch.float32,
+  ):
     super().__init__()
 
     position = torch.arange(max_len).unsqueeze(1)
     scale_factor = -math.log(10000.0) / (d_model // 2 - 1)
     div_term = torch.exp(torch.arange(d_model // 2) * scale_factor)
-    pe = torch.zeros(1, max_len, d_model)
+    pe = torch.zeros(1, max_len, d_model, dtype=dtype)
     pe[0, :, : d_model // 2] = torch.sin(position * div_term)
     pe[0, :, d_model // 2 : 2 * (d_model // 2)] = torch.cos(position * div_term)
     self.register_buffer('pe', pe)
@@ -599,6 +613,7 @@ def __init__(
     num_layers,
     attention_temp,
     pre_ln,
+    dtype: torch.dtype = torch.float32,
   ):
     super().__init__()
     self.layers = nn.ModuleList(
@@ -612,12 +627,15 @@ def __init__(
           layer_norm_eps=layer_norm_eps,
          attention_temp=attention_temp,
          pre_ln=pre_ln,
+          dtype=dtype,
        )
        for _ in range(num_layers)
      ]
    )
    self.num_layers = num_layers
-    self.norm = nn.LayerNorm(d_model, eps=layer_norm_eps) if pre_ln else None
+    self.norm = (
+      nn.LayerNorm(d_model, eps=layer_norm_eps, dtype=dtype) if pre_ln else None
+    )
 
   def forward(
     self,
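For reference, a tiny standalone PyTorch sketch (not from this repo) of the `dtype` factory argument threaded through these modules: it sets the dtype the parameters and buffers are created in, independently of the compute dtype used later under autocast.

# Standalone PyTorch sketch (not from this repo): the `dtype` factory kwarg
# sets the dtype of the parameters/buffers a module is created with.
import torch
import torch.nn as nn

emb = nn.Embedding(32, 8, dtype=torch.float32)
norm = nn.LayerNorm(8, eps=1e-6, dtype=torch.float32)
pe = torch.zeros(1, 256, 8, dtype=torch.float32)  # analogous to the `pe` buffer above

print(emb.weight.dtype, norm.weight.dtype, pe.dtype)  # all torch.float32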

algoperf/workloads/wmt/wmt_pytorch/workload.py

Lines changed: 24 additions & 17 deletions
@@ -24,6 +24,11 @@
 class WmtWorkload(BaseWmtWorkload):
   """WMT PyTorch workload."""
 
+  def __init__(self) -> None:
+    super().__init__()
+    self._param_dtype_pt = spec.PYTORCH_DTYPE_MAP[self._param_dtype]
+    self._compute_dtype_pt = spec.PYTORCH_DTYPE_MAP[self._compute_dtype]
+
   def compute_weighted_cross_entropy(
     self,
     logits: spec.Tensor,
@@ -189,6 +194,7 @@ def init_model_fn(self, rng: spec.RandomState) -> spec.ModelInitState:
       attention_temp=self.attention_temp,
       activation=activation,
       glu=self.glu,
+      dtype=self._param_dtype_pt,
     )
     self._param_shapes = param_utils.pytorch_param_shapes(model)
     self._param_types = param_utils.pytorch_param_types(self._param_shapes)
@@ -228,23 +234,24 @@ def model_fn(
     }
 
     with contexts[mode]():
-      logits_batch = model(
-        src=augmented_and_preprocessed_input_batch['inputs'],
-        tgt=augmented_and_preprocessed_input_batch['targets'],
-        inputs_positions=augmented_and_preprocessed_input_batch.get(
-          'inputs_position', None
-        ),
-        targets_positions=augmented_and_preprocessed_input_batch.get(
-          'targets_position', None
-        ),
-        inputs_segmentation=augmented_and_preprocessed_input_batch.get(
-          'inputs_segmentation', None
-        ),
-        targets_segmentation=augmented_and_preprocessed_input_batch.get(
-          'targets_segmentation', None
-        ),
-        dropout_rate=dropout_rate,
-      )
+      with torch.autocast(device_type='cuda', dtype=self._compute_dtype_pt):
+        logits_batch = model(
+          src=augmented_and_preprocessed_input_batch['inputs'],
+          tgt=augmented_and_preprocessed_input_batch['targets'],
+          inputs_positions=augmented_and_preprocessed_input_batch.get(
+            'inputs_position', None
+          ),
+          targets_positions=augmented_and_preprocessed_input_batch.get(
+            'targets_position', None
+          ),
+          inputs_segmentation=augmented_and_preprocessed_input_batch.get(
+            'inputs_segmentation', None
+          ),
+          targets_segmentation=augmented_and_preprocessed_input_batch.get(
+            'targets_segmentation', None
+          ),
+          dropout_rate=dropout_rate,
+        )
 
     return logits_batch, None
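The autocast wrapper above is the standard PyTorch mixed-precision pattern; here is a minimal standalone sketch (with a CPU fallback so it runs anywhere, unlike the CUDA-only context in the diff):

# Standalone autocast sketch: parameters stay float32, while autocast-eligible
# ops (the linear matmul here) run in bfloat16 inside the context.
import torch
import torch.nn as nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = nn.Linear(16, 4).to(device)  # float32 parameters
x = torch.randn(2, 16, device=device)

with torch.autocast(device_type=device, dtype=torch.bfloat16):
  y = model(x)

print(model.weight.dtype)  # torch.float32
print(y.dtype)  # torch.bfloat16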

algoperf/workloads/wmt/workload.py

Lines changed: 2 additions & 0 deletions
@@ -22,6 +22,8 @@ class BaseWmtWorkload(spec.Workload):
   """A WMT workload."""
 
   _vocab_size: int = 32000
+  _compute_dtype: spec.DTYPE = spec.DTYPE.BFLOAT16
+  _param_dtype: spec.DTYPE = spec.DTYPE.FLOAT32
 
   def __init__(self) -> None:
     super().__init__()
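These two class attributes are the single place the mixed-precision defaults live for both backends. The sketch below is a hypothetical standalone stand-in for the dtype enum and per-framework maps referenced in the diffs (`spec.DTYPE`, `spec.JAX_DTYPE_MAP`, `spec.PYTORCH_DTYPE_MAP`); the real definitions live in algoperf's spec module and are not part of this commit.

# Hypothetical sketch only: an abstract dtype enum plus per-framework lookup
# tables, so a workload can declare _compute_dtype / _param_dtype once and each
# backend resolves them to its own dtype objects.
import enum

import jax.numpy as jnp
import torch


class DTYPE(enum.Enum):
  FLOAT32 = 'float32'
  BFLOAT16 = 'bfloat16'


JAX_DTYPE_MAP = {DTYPE.FLOAT32: jnp.float32, DTYPE.BFLOAT16: jnp.bfloat16}
PYTORCH_DTYPE_MAP = {DTYPE.FLOAT32: torch.float32, DTYPE.BFLOAT16: torch.bfloat16}

print(JAX_DTYPE_MAP[DTYPE.BFLOAT16], PYTORCH_DTYPE_MAP[DTYPE.BFLOAT16])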
