Merged

Changes from 2 commits
37 changes: 37 additions & 0 deletions modelopt/torch/opt/plugins/mcore_dist_checkpointing.py
@@ -22,6 +22,7 @@
from typing import Any

import torch
import yaml
from megatron.core import dist_checkpointing, mpu
from megatron.core.dist_checkpointing.serialization import get_default_load_sharded_strategy
from megatron.core.dist_checkpointing.strategies.common import COMMON_STATE_FNAME
@@ -35,6 +36,21 @@

SUPPORTED_WRAPPERS[Float16Module] = "module"

DROP_SUBSTRINGS = [
"fp4",
"fp8",
"tp_",
"parallel",
"cuda_graph",
"init_",
"cpu",
"recompute",
"inference",
"pipeline",
"comm",
"batch",
]


def remove_per_module_state(
modelopt_state: dict[str, Any],
@@ -122,6 +138,27 @@ def save_sharded_modelopt_state(
sharded_strategy: configures sharded tensors saving behavior and backend
prefix: the prefix to add to the modelopt_state keys ("model." for NeMo)
"""

def _parse_transformer_config(transformer_config: dict) -> dict:
config = {}

for k, v in transformer_config.items():
if any(substring in k for substring in DROP_SUBSTRINGS):
continue
if isinstance(v, (bool, int, str)):
config[k] = v
else:
config[k] = str(v)

return config

if dist.is_master():
run_config_name = f"{checkpoint_name}/modelopt_run_config.yaml"
config_dict = _parse_transformer_config(copy.deepcopy(model[0].config.__dict__))
@ananthsub ananthsub Sep 19, 2025

@ChenhanYu I'm seeing an issue with the deepcopy

These fields are set on the config during training:
https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/8bb2309034bf2eb27783293cd228d86407db1cf9/src/megatron/bridge/training/setup.py#L267C5-L295

https://github.com/NVIDIA/Megatron-LM/blob/c223178a33e239312c3afd81a8adc27cd9ca698c/megatron/training/training.py#L2038-L2057

These are bound methods on DDP class instances, which have PyTorch process groups initialized as member variables. The deepcopy here recurses into deep-copying those class instances themselves, causing issues like this:

b-pretrain/0 [rank0]:   File "/opt/TensorRT-Model-Optimizer/modelopt/torch/opt/plugins/mcore_dist_checkpointing.py", line 157, in save_sharded_modelopt_state
b-pretrain/0 [rank0]:     config_dict = _parse_transformer_config(copy.deepcopy(model[0].config.__dict__))
b-pretrain/0 [rank0]:                                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 226, in _deepcopy_method
b-pretrain/0 [rank0]:     return type(x)(x.__func__, deepcopy(x.__self__, memo))
b-pretrain/0 [rank0]:                                ^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 162, in deepcopy
b-pretrain/0 [rank0]:     y = _reconstruct(x, memo, *rv)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 259, in _reconstruct
b-pretrain/0 [rank0]:     state = deepcopy(state, memo)
b-pretrain/0 [rank0]:             ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 196, in _deepcopy_list
b-pretrain/0 [rank0]:     append(deepcopy(a, memo))
b-pretrain/0 [rank0]:            ^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 162, in deepcopy
b-pretrain/0 [rank0]:     y = _reconstruct(x, memo, *rv)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 259, in _reconstruct
b-pretrain/0 [rank0]:     state = deepcopy(state, memo)
b-pretrain/0 [rank0]:             ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 162, in deepcopy
b-pretrain/0 [rank0]:     y = _reconstruct(x, memo, *rv)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 259, in _reconstruct
b-pretrain/0 [rank0]:     state = deepcopy(state, memo)
b-pretrain/0 [rank0]:             ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 162, in deepcopy
b-pretrain/0 [rank0]:     y = _reconstruct(x, memo, *rv)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 259, in _reconstruct
b-pretrain/0 [rank0]:     state = deepcopy(state, memo)
b-pretrain/0 [rank0]:             ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 162, in deepcopy
b-pretrain/0 [rank0]:     y = _reconstruct(x, memo, *rv)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 259, in _reconstruct
b-pretrain/0 [rank0]:     state = deepcopy(state, memo)
b-pretrain/0 [rank0]:             ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 162, in deepcopy
b-pretrain/0 [rank0]:     y = _reconstruct(x, memo, *rv)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 259, in _reconstruct
b-pretrain/0 [rank0]:     state = deepcopy(state, memo)
b-pretrain/0 [rank0]:             ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 136, in deepcopy
b-pretrain/0 [rank0]:     y = copier(x, memo)
b-pretrain/0 [rank0]:         ^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 221, in _deepcopy_dict
b-pretrain/0 [rank0]:     y[deepcopy(key, memo)] = deepcopy(value, memo)
b-pretrain/0 [rank0]:                              ^^^^^^^^^^^^^^^^^^^^^
b-pretrain/0 [rank0]:   File "/usr/lib/python3.12/copy.py", line 151, in deepcopy
b-pretrain/0 [rank0]:     rv = reductor(4)
b-pretrain/0 [rank0]:          ^^^^^^^^^^^
b-pretrain/0 [rank0]: TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object

In Megatron Bridge, we have a YAML representer for this, which lets us serialize our configs to YAML:

https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/8bb2309034bf2eb27783293cd228d86407db1cf9/src/megatron/bridge/utils/yaml_utils.py#L24-L87

So we don't see this issue when serializing our overall job config to YAML (which includes the transformer config):

https://github.com/NVIDIA-NeMo/Megatron-Bridge/blob/8bb2309034bf2eb27783293cd228d86407db1cf9/src/megatron/bridge/training/checkpointing.py#L653
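A minimal sketch of that representer pattern — not the Megatron-Bridge code linked above, just an illustration of the fallback idea: register a catch-all YAML representer so objects the dumper does not natively understand (process groups, bound methods, and so on) are written as strings instead of being deep-copied or pickled.

```python
import yaml


class _RunConfigDumper(yaml.SafeDumper):
    """Dumper whose fallback representer never fails on exotic config members."""


def _represent_as_str(dumper: yaml.SafeDumper, data: object) -> yaml.Node:
    # Fall back to the object's string form; good enough for a
    # human-readable run-config snapshot.
    return dumper.represent_str(str(data))


# Multi-representers are consulted only when no exact-type representer
# matches, so dicts, lists, ints, strings, etc. still serialize normally.
_RunConfigDumper.add_multi_representer(object, _represent_as_str)

# Hypothetical usage:
# with open("modelopt_run_config.yaml", "w") as f:
#     yaml.dump(config.__dict__, f, Dumper=_RunConfigDumper)
```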

Collaborator Author

I see. The deepcopy here is likely not necessary. Let me remove it.

config_dict["nvidia_modelopt_version"] = modelopt.__version__
with open(run_config_name, "w") as f:
yaml.dump(config_dict, f, default_flow_style=False)

if not mto.ModeloptStateManager.is_converted(model[0]):
return
if len(model) > 1:
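Picking up the reply above, a hedged sketch of what dropping the deepcopy could look like (illustration only, not necessarily the merged change). `_parse_transformer_config` only reads values — it copies bools/ints/strs and stringifies everything else — so it can walk the config dict without a deep copy; the shallow `dict()` below is an extra precaution assumed here, not something the thread asks for.

```python
if dist.is_master():
    run_config_name = f"{checkpoint_name}/modelopt_run_config.yaml"
    # No copy.deepcopy: the helper never mutates the config, and anything
    # that is not a bool/int/str is stringified, so process groups and
    # bound methods are never pickled.
    config_dict = _parse_transformer_config(dict(model[0].config.__dict__))
    config_dict["nvidia_modelopt_version"] = modelopt.__version__
    with open(run_config_name, "w") as f:
        yaml.dump(config_dict, f, default_flow_style=False)
```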
87 changes: 87 additions & 0 deletions modelopt/torch/utils/plugins/megatron_preprocess_data.py
@@ -31,11 +31,14 @@
```
"""

import argparse
import json
import multiprocessing
import sys
from pathlib import Path

import requests
from datasets import load_dataset
Comment on lines +40 to +41

⚠️ Potential issue

🧩 Analysis chain

Ensure packaging lists new dependencies.

requests and datasets are new runtime deps for the CLI. Verify they’re declared (pyproject/requirements/extras).

Also applies to: 206-289


🏁 Script executed:

#!/bin/bash
# Verify packaging declares 'datasets' and 'requests'
rg -nP -C2 '^(datasets|requests)\b' -g 'pyproject.toml' -g 'requirements*.txt' -g 'setup.cfg' -g 'setup.py'

Length of output: 1129


Add 'requests' and 'datasets' to project packaging/runtime deps

modelopt/torch/utils/plugins/megatron_preprocess_data.py imports requests and datasets (lines 40–41; also referenced ~206–289). Search shows only 'datasets' in examples/*/requirements.txt and no top-level declaration for either. Add both to pyproject.toml (or setup.cfg / setup.py) or the CLI extras so the CLI runtime installs them.

🤖 Prompt for AI Agents
In modelopt/torch/utils/plugins/megatron_preprocess_data.py around lines 40-41
(and usages ~206-289), the module imports and uses the third-party packages
'requests' and 'datasets' but they are not declared in the top-level project
dependencies; add both 'requests' and 'datasets' to the project's
packaging/runtime dependencies (e.g., pyproject.toml [project]/dependencies or
setup.cfg/setup.py install_requires) or include them in the CLI extras so that
the CLI runtime installs them; pick appropriate minimal version constraints
consistent with examples/*/requirements.txt (or mirror examples) and run
dependency install/tests to verify imports resolve.
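If requests and datasets end up as optional extras rather than core dependencies, one hypothetical way to keep the CLI usable is a guarded check at entry; the helper name and message below are illustrative, not part of this PR.

```python
def _require_cli_deps() -> None:
    # Hypothetical guard: fail with an actionable message instead of a bare
    # ImportError when the optional CLI dependencies are missing.
    missing = []
    for pkg in ("requests", "datasets"):
        try:
            __import__(pkg)
        except ImportError:
            missing.append(pkg)
    if missing:
        raise SystemExit(
            "megatron_preprocess_data CLI requires "
            + ", ".join(missing)
            + "; install with `pip install "
            + " ".join(missing)
            + "`."
        )
```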

from megatron.core.datasets import indexed_dataset
from transformers import AutoTokenizer

@@ -198,3 +201,87 @@ def megatron_preprocess_data(
final_enc_len += num_tokens

print(f">>> Total number of tokens: {final_enc_len}")


def main():
"""Sample main function to process large data for pretraining.

Example usage:

>>> python megatron_preprocess_data.py \
--dataset "nvidia/Nemotron-Pretraining-Dataset-sample" \
--tokenizer "meta-llama/Llama-3.2-1B-Instruct" \
--output_dir "./processed_data"
"""
parser = argparse.ArgumentParser(prog="megatron_preprocess_data")
parser.add_argument("--input_path", type=str, default=None, help="Input path.")
parser.add_argument(
"--dataset",
type=str,
default="nvidia/Nemotron-Pretraining-Dataset-sample",
help="Hugging Face Hub dataset name or path",
)
parser.add_argument("--subset", type=str, default=None, help="Hugging Face Hub dataset subset")
parser.add_argument("--split", type=str, default="train", help="Hugging Face Hub dataset split")
parser.add_argument(
Comment on lines +225 to +226

⚠️ Potential issue

Fix default --split: make it None to process all splits when omitted

Current default "train" contradicts the PR description (“all subsets and splits if subset/split omitted”) and filters to train-only. Set default to None and keep the existing filter logic.

Apply this diff:

-    parser.add_argument("--split", type=str, default="train", help="Hugging Face Hub dataset split")
+    parser.add_argument(
+        "--split",
+        type=str,
+        default=None,
+        help="Hugging Face Hub dataset split (process all if omitted)",
+    )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
parser.add_argument("--split", type=str, default="train", help="Hugging Face Hub dataset split")
parser.add_argument(
parser.add_argument(
"--split",
type=str,
default=None,
help="Hugging Face Hub dataset split (process all if omitted)",
)
parser.add_argument(
🤖 Prompt for AI Agents
In modelopt/torch/utils/plugins/megatron_preprocess_data.py around lines
225-226, the --split argument currently defaults to "train" which incorrectly
limits processing to the train split; change the parser.add_argument for
"--split" to use default=None so that when omitted the existing logic will
process all splits/subsets, leaving the current filter logic unchanged.

"--output_dir", type=str, default="./processed_data", help="Output directory"
)
parser.add_argument("--tokenizer", type=str, required=True, help="Tokenizer name or path")
parser.add_argument("--json_keys", nargs="+", default=["text"], help="JSON keys to tokenize")
parser.add_argument("--append_eod", action="store_true", help="Append <eod> token")
parser.add_argument(
"--max_sequence_length", type=int, default=None, help="Maximum sequence length"
)
parser.add_argument("--workers", type=int, default=8, help="Number of worker processes")
parser.add_argument("--log_interval", type=int, default=1000, help="Log interval")
args = parser.parse_args()

if args.input_path is None:
args.input_path = []

response = requests.get(
"https://datasets-server.huggingface.co/splits?dataset={}".format(args.dataset),
timeout=10,
)

for entry in response.json()["splits"]:
skip_processing = False
name = entry["dataset"]
subset = entry.get("config", None)
split = entry["split"]

if args.subset is not None and args.subset != subset:
skip_processing = True
if args.split is not None and args.split != split:
skip_processing = True

print(f"Loading dataset {name} with subset {subset} and split {split}")
dataset = load_dataset(name, subset, split=split)

for key in args.json_keys:
if key not in dataset.features:
print(f"Key {key} not found in dataset features. Skipping...")
skip_processing = True
break

if skip_processing:
continue

json_file_path = args.output_dir + "/" + name + "_" + subset + "_" + split + ".jsonl"
dataset.to_json(json_file_path)
args.input_path += [json_file_path]

megatron_preprocess_data(
input_path=args.input_path,
output_dir=args.output_dir,
tokenizer_name_or_path=args.tokenizer,
json_keys=args.json_keys,
append_eod=args.append_eod,
max_sequence_length=args.max_sequence_length,
workers=args.workers,
log_interval=args.log_interval,
)


if __name__ == "__main__":
main()