update readme

jenchen13 · jenchen13 · commit 16f61f1211f5 · 2025-09-17T20:03:19.000Z
Signed-off-by: Jennifer Chen <jennifchen@nvidia.com>
diff --git a/examples/nemo_run/qat/README.md b/examples/nemo_run/qat/README.md
@@ -48,6 +48,8 @@ QAT of Qwen3-8B NVFP4 recovers most of the accuracy on the MMLU benchmark after
 | Qwen3-8B NVFP4            | 70.3    |
 | Qwen3-8B NVFP4 after QAT  | 72.8    |
 
+The resulting exported checkpoint is also much smaller in memory, at 6.4 GB compared to 16.4 GB for the original BF16 checkpoint.
+
 ## Usage
 
 ### Prerequisites
diff --git a/examples/nemo_run/qat/nemo_qat_flow.py b/examples/nemo_run/qat/nemo_qat_flow.py
@@ -140,7 +140,7 @@ def get_args():
         action="store_true",
         default=False,
     )
-    parser.add_argument("--tensor_parallelism", type=int, default=1)
+    parser.add_argument("--tensor_parallelism", type=int, default=2)
     parser.add_argument("--pipeline_parallelism", type=int, default=1)
     return parser.parse_args()
 
@@ -375,7 +375,7 @@ def main(args):
     SEQUENCE_LENGTH = 4096
     MBS = 1
     GBS = 512
-    TRAIN_STEPS = 400
+    TRAIN_STEPS = 200
     VAL_INTERVAL = 50
     # # # # # # # # # # # # # # # # # # # # # #