@@ -210,13 +210,16 @@ def main():
 
     >>> python megatron_preprocess_data.py \
         --dataset "nvidia/Nemotron-Pretraining-Dataset-sample" \
-        --tokenizer "nvidia/Nemotron-Pretraining-Tokenizer" \
+        --tokenizer "meta-llama/Llama-3.2-1B-Instruct" \
         --output_dir "./processed_data"
     """
     parser = argparse.ArgumentParser(prog="megatron_preprocess_data")
     parser.add_argument("--input_path", type=str, default=None, help="Input path.")
     parser.add_argument(
-        "--dataset", type=str, default=None, help="Hugging Face Hub dataset name or path"
+        "--dataset",
+        type=str,
+        default="nvidia/Nemotron-Pretraining-Dataset-sample",
+        help="Hugging Face Hub dataset name or path",
     )
     parser.add_argument("--subset", type=str, default=None, help="Hugging Face Hub dataset subset")
     parser.add_argument("--split", type=str, default="train", help="Hugging Face Hub dataset split")
@@ -225,7 +228,7 @@ def main():
     )
     parser.add_argument("--tokenizer", type=str, required=True, help="Tokenizer name or path")
     parser.add_argument("--json_keys", nargs="+", default=["text"], help="JSON keys to tokenize")
-    parser.add_argument("--append_eod", type=bool, default=False, help="Append <eod> token")
+    parser.add_argument("--append_eod", action="store_true", help="Append <eod> token")
     parser.add_argument(
         "--max_sequence_length", type=int, default=None, help="Maximum sequence length"
     )
@@ -235,8 +238,6 @@ def main():
 
     if args.input_path is None:
         args.input_path = []
-        if args.dataset is None:
-            args.dataset = "nvidia/Nemotron-Pretraining-Dataset-sample"
 
         response = requests.get(
             "https://datasets-server.huggingface.co/splits?dataset={}".format(args.dataset),
@@ -250,9 +251,9 @@ def main():
             split = entry["split"]
 
             if args.subset is not None and args.subset != subset:
-                continue
+                skip_processing = True
             if args.split is not None and args.split != split:
-                continue
+                skip_processing = True
 
             print(f"Loading dataset {name} with subset {subset} and split {split}")
             dataset = load_dataset(name, subset, split=split)