
Commit 89075c8

added some more explanations
1 parent 9e29cca commit 89075c8

3 files changed (+20 / -22 lines)


README.md

Lines changed: 13 additions & 1 deletion

````diff
@@ -86,7 +86,19 @@ The gradient transformations might return gradients that are infinite. In this c
 The following provides a small example, training a vision transformer on Cifar100 presenting all the important features of `mpx`. For details, please visit examples/train_vit.py.
 This example will not go into the details for the neural network part, but just the `mpx` relevant parts.
 
-When loading the datasets, instantiating the models etc., you must instantiate the loss scaling. Typically, the initial value is set to the maximum value of `float16`.
+### Installation and Execution of the Example
+First, install JAX for your hardware.
+Then install all dependencies via
+```bash
+pip install -r examples/requirements.txt
+```
+You can then run the example via the command below. Note: the script downloads Cifar100.
+```bash
+python -m examples.train_vit
+```
+
+### Explanation
+The loss scaling has to be initialized during the instantiation of the datasets, models, etc. Typically, the initial value is set to the maximum value of `float16`.
 
 ```python
````

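The "maximum value of `float16`" used to seed the loss scale is a fixed constant (in JAX it is `jnp.finfo(jnp.float16).max`). A minimal dependency-free sketch of that constant; the name `init_loss_scale` is illustrative and not part of the `mpx` API:

```python
# Largest finite float16 value: (2 - 2**-10) * 2**15 = 65504.
# Loss scaling is typically initialized to this constant so that small
# gradients survive the cast to half precision before being unscaled.
FLOAT16_MAX = (2.0 - 2.0**-10) * 2.0**15

# Illustrative name only; the actual initialization is shown in examples/train_vit.py.
init_loss_scale = FLOAT16_MAX
print(init_loss_scale)  # 65504.0
```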
examples/requirements.txt

Lines changed: 0 additions & 14 deletions

```diff
@@ -25,10 +25,6 @@ idna==3.10
 immutabledict==4.2.1
 importlib_resources==6.5.2
 iniconfig==2.1.0
-jax==0.6.0
-jax-cuda12-pjrt==0.6.0
-jax-cuda12-plugin==0.6.0
-jaxlib==0.6.0
 jaxtyping==0.3.2
 keras==3.9.2
 kiwisolver==1.4.8
@@ -41,16 +37,6 @@ mdurl==0.1.2
 ml_dtypes==0.5.1
 namex==0.0.9
 numpy==2.1.3
-nvidia-cublas-cu12==12.9.0.13
-nvidia-cuda-cupti-cu12==12.9.19
-nvidia-cuda-nvcc-cu12==12.9.41
-nvidia-cuda-runtime-cu12==12.9.37
-nvidia-cudnn-cu12==9.10.1.4
-nvidia-cufft-cu12==11.4.0.6
-nvidia-cusolver-cu12==11.7.4.40
-nvidia-cusparse-cu12==12.5.9.5
-nvidia-nccl-cu12==2.26.5
-nvidia-nvjitlink-cu12==12.9.41
 opt_einsum==3.4.0
 optax==0.2.4
 optree==0.15.0
```

examples/train_vit.py

Lines changed: 7 additions & 7 deletions

```diff
@@ -188,8 +188,9 @@ def init_tf_dataloader_image(data_source, batch_size, num_epochs, seed, resoluti
     data = data.as_numpy_iterator()
     return data
 
-train_dataset = init_tf_dataloader_image(train_data_source, config["batch_size"], config["num_epochs"], 0, 32)
-val_dataset = init_tf_dataloader_image(val_data_source, config["batch_size"], config["num_epochs"], 0, 32)
+# we make the resolution way too high for CIFAR100, but this is just for testing and to force the training to use a lot of memory.
+train_dataset = init_tf_dataloader_image(train_data_source, config["batch_size"], config["num_epochs"], 0, 224)
+val_dataset = init_tf_dataloader_image(val_data_source, config["batch_size"], config["num_epochs"], 0, 224)
 
 #########################################
 # Sharding
@@ -226,7 +227,7 @@ def init_tf_dataloader_image(data_source, batch_size, num_epochs, seed, resoluti
 # Load optimizer
 ########################################
 # optimizer strategy from https://arxiv.org/abs/2106.10270
-duration_linear_schedule = 1000
+duration_linear_schedule = 100
 linear_schedule = optax.linear_schedule(
     init_value=config["learning_rate"] * 0.01,
     end_value=config["learning_rate"],
@@ -322,14 +323,13 @@ def init_tf_dataloader_image(data_source, batch_size, num_epochs, seed, resoluti
 if __name__ == "__main__":
     config = {
         "train_mixed_precision": True,
-        "batch_size": 512,
+        "batch_size": 256,
         "num_epochs": 10,
-        "num_features": 128,
+        "num_features": 256,
         "num_heads": 4,
-        "num_features_residual": 256,
+        "num_features_residual": 800,
         "num_transformer_blocks": 12,
         "learning_rate": 0.001,
-        "batch_size": 128,
         "weight_regularization": 0.001,
     }
     main(config)
```
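The `duration_linear_schedule` value changed in this commit controls a linear warmup: `optax.linear_schedule` ramps the learning rate from 1% of the target up to the target over that many steps, then holds it constant. A minimal pure-Python sketch of the same interpolation, using values that mirror the example's config; `linear_warmup` is a hypothetical helper, not a function from the script:

```python
def linear_warmup(step, init_value, end_value, transition_steps):
    # Mirrors optax.linear_schedule: linear interpolation from init_value
    # to end_value, held constant once transition_steps is reached.
    frac = min(step, transition_steps) / transition_steps
    return init_value + frac * (end_value - init_value)

lr = 0.001  # config["learning_rate"] in the example
print(linear_warmup(0, lr * 0.01, lr, 100))    # 1% of the target rate at step 0
print(linear_warmup(50, lr * 0.01, lr, 100))   # halfway through the warmup ramp
print(linear_warmup(100, lr * 0.01, lr, 100))  # full learning rate after warmup
print(linear_warmup(500, lr * 0.01, lr, 100))  # held constant past the warmup
```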
