Skip to content

Commit 429ad30

Browse files
author
Alexander
committed
temp
1 parent 4634480 commit 429ad30

File tree

4 files changed

+418
-53
lines changed

4 files changed

+418
-53
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ These functions are just for your information. They are internally used, however
6161

6262
### Gradient Computation
6363
`mpx` provides function decorators for gradient calculations that summarize steps 3--9 in one function call. They have the same meaning and syntax as the corresponding decorators of `equinox`. This means that, for an existing training pipeline, one can replace calls to `equinox.filter_grad/filter_value_and_grad` with `mpx.filter_grad/filter_value_and_grad`.
64-
- `filter_grad(func, scaling: loss_scaling.DynamicLossScaling, has_aux=False, use_mixed_precision=True)`: Transformation that computes the gradient of func with respect to its first argument using mixed precision with scaling, similar to `equinox.filter_grad`. The decorator works as follows:
64+
- `filter_grad(func, scaling: loss_scaling.DynamicLossScaling, has_aux=False, use_mixed_precision=True)`: Transformation that computes the gradient of func with respect to its first argument using mixed precision with scaling, similar to `equinox.filter_grad`. The transformed function then works as follows:
6565
1. If `use_mixed_precision` is True:
6666
- Casts all input arguments to half precision (float16/bfloat16)
6767
- Scales the function's output by `scaling`

examples/Bert.py

Lines changed: 80 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,12 @@
1616
from tqdm import notebook as tqdm # https://github.com/tqdm/tqdm
1717
from transformers import AutoTokenizer # https://github.com/huggingface/transformers
1818

19+
import einshape as es
20+
1921
from examples.transformer import TransformerLayer
2022

23+
import mpx
24+
2125
class EmbedderBlock(eqx.Module):
2226
"""BERT embedder."""
2327

@@ -59,7 +63,7 @@ def __call__(
5963
position_ids: Array,
6064
segment_ids: Array,
6165
enable_dropout: bool = False,
62-
key: jax.random.PRNGKey | None = None,
66+
key: jax.random.PRNGKey = None,
6367
) -> Array:
6468
tokens = jax.vmap(self.token_embedder)(token_ids)
6569
segments = jax.vmap(self.segment_embedder)(segment_ids)
@@ -129,7 +133,7 @@ def __call__(
129133
segment_ids: Array,
130134
*,
131135
enable_dropout: bool = False,
132-
key: jax.random.PRNGKey | None = None,
136+
key: jax.random.PRNGKey = None,
133137
) -> dict[str, Array]:
134138
emb_key, l_key = (None, None) if key is None else jax.random.split(key)
135139

@@ -216,21 +220,20 @@ def compute_loss(classifier, inputs, key):
216220
batch_size = inputs["token_ids"].shape[0]
217221
batched_keys = jax.random.split(key, num=batch_size)
218222
logits = jax.vmap(classifier, in_axes=(0, None, 0))(inputs, True, batched_keys)
219-
return jnp.mean(
220-
optax.softmax_cross_entropy_with_integer_labels(
223+
# all of these operations are done in full precision
224+
return mpx.force_full_precision(jnp.mean)(
225+
mpx.force_full_precision(optax.softmax_cross_entropy_with_integer_labels)(
221226
logits=logits, labels=inputs["label"]
222227
)
223228
)
224229

225230

226-
def make_step(model, inputs, opt_state, key, tx):
231+
def make_step(model, inputs, opt_state, key, tx, scaling: mpx.DynamicLossScaling):
227232
key, new_key = jax.random.split(key)
228-
loss, grads = compute_loss(model, inputs, key)
229-
grads = jax.lax.pmean(grads, axis_name="devices")
230-
231-
updates, opt_state = tx.update(grads, opt_state, model)
232-
model = eqx.apply_updates(model, updates)
233-
return loss, model, opt_state, new_key
233+
loss, scaling, grads_finite, grads = mpx.filter_value_and_grad(compute_loss, scaling)(model, inputs, key)
234+
235+
model, opt_state = mpx.optimizer_update(model, tx, opt_state, grads, grads_finite)
236+
return loss, model, opt_state, new_key, scaling
234237

235238

236239
def make_eval_step(model, inputs):
@@ -239,6 +242,7 @@ def make_eval_step(model, inputs):
239242
if __name__ == "__main__":
240243
# Tiny-BERT config.
241244
bert_config = {
245+
"train_mixed_precision": True,
242246
"vocab_size": 30522,
243247
"hidden_size": 128,
244248
"num_hidden_layers": 2,
@@ -271,36 +275,76 @@ def tokenize(example):
271275
batch_size = 32
272276
learning_rate = 1e-5
273277

278+
############################
279+
# init model
280+
############################
281+
model = BertClassifier(config=bert_config, num_classes=2, key=model_key)
282+
283+
############################
284+
# init optimizer
285+
############################
286+
tx = optax.adam(learning_rate=learning_rate)
287+
tx = optax.chain(optax.clip_by_global_norm(1.0), tx)
288+
opt_state = tx.init(model)
289+
290+
############################
291+
# init scaling
292+
############################
293+
if bert_config["train_mixed_precision"]:
294+
loss_scaling = mpx.DynamicLossScaling(loss_scaling=jnp.ones((1,), dtype=jnp.float32) * int((2 - 2**(-10)) * 2**15),
295+
min_loss_scaling=jnp.ones((1,), dtype=jnp.float32) * 1.0)
296+
else:
297+
loss_scaling = None
298+
299+
############################
300+
# training
301+
############################
274302
for epoch in range(epochs):
275-
with tqdm.tqdm(
276-
ds["train"].iter(batch_size=batch_size, drop_last_batch=True),
277-
total=ds["train"].num_rows // batch_size,
278-
unit="steps",
279-
desc=f"Epoch {epoch+1}/{epochs}",
280-
) as tqdm_epoch:
281-
for batch in tqdm_epoch:
303+
with tqdm.tqdm(
304+
ds["train"].iter(batch_size=batch_size, drop_last_batch=True),
305+
total=ds["train"].num_rows // batch_size,
306+
unit="steps",
307+
desc=f"Epoch {epoch+1}/{epochs}",
308+
) as tqdm_epoch:
309+
310+
for batch in tqdm_epoch:
311+
token_ids, token_type_ids = batch["input_ids"], batch["token_type_ids"]
312+
label = batch["label"]
313+
314+
# swap time and feature axis.
315+
token_ids = es.jax_einshape("bhn->bnh", token_ids)
316+
token_type_ids = es.jax_einshape("bhn->bnh", token_type_ids)
317+
318+
inputs = {
319+
"token_ids": token_ids,
320+
"segment_ids": token_type_ids,
321+
"label": label,
322+
}
323+
loss, model, opt_state, train_key, loss_scaling = make_step(
324+
model, inputs, opt_state, train_key, tx, scaling=loss_scaling
325+
)
326+
327+
tqdm_epoch.set_postfix(loss=np.sum(loss).item())
328+
329+
outputs = []
330+
for batch in tqdm.tqdm(
331+
ds["validation"].iter(batch_size=batch_size),
332+
unit="steps",
333+
total=np.ceil(ds["validation"].num_rows / batch_size),
334+
desc="Validation",
335+
):
282336
token_ids, token_type_ids = batch["input_ids"], batch["token_type_ids"]
283337
label = batch["label"]
284338

285-
# Split batch across devices.
286-
token_ids = einops.rearrange(
287-
token_ids, "(b1 b2) s -> b1 b2 s", b1=num_devices
288-
)
289-
token_type_ids = einops.rearrange(
290-
token_type_ids, "(b1 b2) s -> b1 b2 s", b1=num_devices
291-
)
292-
label = einops.rearrange(label, "(b1 b2) -> b1 b2", b1=num_devices)
293-
294-
inputs = {
295-
"token_ids": token_ids,
296-
"segment_ids": token_type_ids,
297-
"label": label,
298-
}
299-
loss, model, opt_state, train_key = p_make_step(
300-
model, inputs, opt_state, train_key
301-
)
302339

303-
tqdm_epoch.set_postfix(loss=np.sum(loss).item())
340+
inputs = {"token_ids": token_ids, "segment_ids": token_type_ids}
341+
342+
# Compare predicted class with label.
343+
output = make_eval_step(model, inputs)
344+
output = map(float, np.argmax(output.reshape(-1, 2), axis=-1) == label)
345+
outputs.extend(output)
346+
347+
print(f"Accuracy: {100 * np.sum(outputs) / len(outputs):.2f}%")
304348

305349

306350

0 commit comments

Comments
 (0)