Skip to content

Commit 7f85833

Browse files
kashif and yzhangcs authored
Update torchtitan and train.py (#21)
* activations on CUDA offloaded
* add save_for_all_ranks config
* update torchtitan
* update train.py
* use build_loss_fn
* add get_nparams_and_flops
* remove unused import
* Fix isort issues

---------

Co-authored-by: Yu Zhang <yzhang.cs@outlook.com>
1 parent c949efe commit 7f85833

File tree

6 files changed

+107
-143
lines changed

6 files changed

+107
-143
lines changed

3rdparty/flash-linear-attention

3rdparty/torchtitan

Submodule torchtitan updated 92 files

flame/config_manager.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -466,15 +466,15 @@ def __init__(self):
466466
default="tb",
467467
help="Folder to dump TensorBoard states",
468468
)
469-
# TODO: store_true & default=True make impossible for cmd to set it to False
470469
self.parser.add_argument(
471-
"--metrics.rank_0_only",
470+
"--metrics.save_for_all_ranks",
472471
action="store_true",
473-
default=True,
472+
default=False,
474473
help="""
475-
Whether to save TensorBoard metrics only for rank 0 or for all ranks.
476-
When pipeline_parallel_degree is > 1, this option uses the 0th rank of the last stage pipeline group,
477-
which is the only stage that computes loss metrics.
474+
Whether to save TensorBoard/Wandb metrics only for rank 0 or for all ranks.
475+
When this option is False and pipeline_parallel_degree is > 1, the metrics
476+
component uses the 0th rank of the last stage pipeline group, which is the
477+
only stage that computes loss metrics.
478478
""",
479479
)
480480
self.parser.add_argument(

flame/models/activation_offloading.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -151,11 +151,15 @@ def pack_tensor(activation: torch.Tensor) -> int:
151151
num_bytes = get_num_bytes_tensor(activation)
152152
tensor_id = get_tensor_id()
153153

154-
# only offload hefty bois if they're activations (our heuristic for that is to
155-
# check if they're not params or buffers)!
156-
if num_bytes >= self.min_tensor_size_bytes and (
157-
not isinstance(activation, torch.nn.Parameter)
158-
and not isinstance(activation, torch.nn.Buffer)
154+
# only offload hefty bois if they're activations on CUDA (our heuristic
155+
# for that is to check if they're not params or buffers)!
156+
if (
157+
activation.is_cuda
158+
and num_bytes >= self.min_tensor_size_bytes
159+
and (
160+
not isinstance(activation, torch.nn.Parameter)
161+
and not isinstance(activation, torch.nn.Buffer)
162+
)
159163
):
160164
if self.use_streams:
161165
# First, sync back and dereference previously offloaded tensors

flame/tools/utils.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,18 @@
44
# This source code is licensed under the BSD-style license found in the
55
# LICENSE file in the root directory of this source tree.
66

7+
from torch import nn
78
from torchtitan.tools.logging import logger
89

910

10-
def get_num_flop_per_token(num_params: int, model_config, seq_len) -> int:
11+
def get_nparams_and_flops(model: nn.Module, model_config, seq_len: int) -> tuple[int, int]:
12+
nparams = sum(p.numel() for p in model.parameters())
13+
nparams_embedding = sum(
14+
sum(p.numel() for p in m.parameters())
15+
for m in model.children()
16+
if isinstance(m, nn.Embedding)
17+
)
18+
1119
if hasattr(model_config, "num_heads"):
1220
num_heads = model_config.num_heads
1321
elif hasattr(model_config, "num_attention_heads"):
@@ -28,6 +36,6 @@ def get_num_flop_per_token(num_params: int, model_config, seq_len) -> int:
2836
# but recomputation should not be counted in calculating MFU (+0)
2937
# 3. each matmul performs 1 multiplication and 1 addition (*2)
3038
# 4. we follow the convention and do not account for sparsity in causal attention
31-
flop_per_token = 6 * num_params + 12 * l * h * q * t
39+
num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t
3240

33-
return flop_per_token
41+
return nparams, num_flops_per_token

0 commit comments

Comments (0)