Commit 34d5276

Slurm support for QAT simplified flow
Signed-off-by: Jennifer Chen <[email protected]>
1 parent 6ec8cdc commit 34d5276

7 files changed: +568 additions, -122 deletions

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from nemo.collections.llm.modelopt import setup_trainer_and_restore_model_with_modelopt_spec

from modelopt.torch.export.plugins.nemo_run import _get_most_recent_ckpt
from modelopt.torch.utils.plugins.megatron_mmlu import megatron_mmlu


def parse_args():
    parser = argparse.ArgumentParser(
        description="Run MMLU evaluation with ModelOpt Megatron model. Provide either --nemo_ckpt or --ckpt_dir"
    )
    parser.add_argument("--nemo_ckpt", type=str, required=False, help="Path to NeMo checkpoint.")
    parser.add_argument(
        "--ckpt_dir",
        required=False,
        type=str,
        help="Checkpoint directory of 1 or more finetuned models",
    )
    parser.add_argument(
        "--tensor_parallelism", type=int, default=1, help="Tensor parallelism size."
    )
    parser.add_argument(
        "--pipeline_parallelism", type=int, default=1, help="Pipeline parallelism size."
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    assert args.nemo_ckpt or args.ckpt_dir, "Provide one of either --nemo_ckpt or --ckpt_dir."
    ckpt_path = args.nemo_ckpt
    if args.ckpt_dir:
        ckpt_path = _get_most_recent_ckpt(args.ckpt_dir)
    model, trainer = setup_trainer_and_restore_model_with_modelopt_spec(
        ckpt_path,
        tensor_model_parallel_size=args.tensor_parallelism,
        pipeline_model_parallel_size=args.pipeline_parallelism,
    )
    tokenizer = model.tokenizer.tokenizer
    megatron_mmlu(model.module, tokenizer)
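
The script above restores a checkpoint with the ModelOpt Megatron spec and runs MMLU on it. Below is a minimal sketch of the same flow invoked directly from Python rather than via the CLI flags; the checkpoint path is a placeholder and the parallelism sizes simply mirror the script's defaults.

# Hypothetical direct-call equivalent of the CLI entry point above;
# "/results/finetuned_model" is a placeholder checkpoint path, not a value from this commit.
from nemo.collections.llm.modelopt import setup_trainer_and_restore_model_with_modelopt_spec
from modelopt.torch.utils.plugins.megatron_mmlu import megatron_mmlu

model, trainer = setup_trainer_and_restore_model_with_modelopt_spec(
    "/results/finetuned_model",  # placeholder checkpoint path
    tensor_model_parallel_size=1,
    pipeline_model_parallel_size=1,
)
megatron_mmlu(model.module, model.tokenizer.tokenizer)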
File renamed without changes.
File renamed without changes.
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os
from pathlib import Path

from datasets import load_dataset


def get_parser():
    parser = argparse.ArgumentParser(description="Process nvidia/OpenScience dataset")
    parser.add_argument("--output-dir", type=str, default=".")
    return parser


def convert_row_oai(row: dict):
    return {
        "messages": [
            {"role": "user", "content": row["input"]},
            {"role": "assistant", "content": row["output"]},
        ]
    }


def process_subset(raw_dir, proc_dir):
    ds = load_dataset(raw_dir)
    ds = ds.map(convert_row_oai, remove_columns=["input", "output"])

    split_ds = ds["train"].train_test_split(test_size=0.1)
    split_ds["train"].to_json(os.path.join(proc_dir, "training.jsonl"))
    split_ds["test"].to_json(os.path.join(proc_dir, "validation.jsonl"))


# TODO remove below?
def sample_openscience(raw_dir, proc_dir, sample_ratio=1):
    """Process raw OpenScience data by subsampling the dataset by default, then
    writing into train/val split with 99/1 ratio"""
    files = os.listdir(raw_dir)
    num_data = 0

    for file in files:
        # Open each jsonl
        if file.endswith("jsonl"):
            print(f"Sampling from {file}")
            with (
                open(os.path.join(raw_dir, file)) as f_raw,
                open(os.path.join(proc_dir, "training.jsonl"), "a") as f_train,
                open(os.path.join(proc_dir, "validation.jsonl"), "a") as f_val,
            ):
                for idx, line in enumerate(f_raw):
                    if idx % sample_ratio != 0:
                        continue
                    data = json.loads(line)
                    # convert dictionary to OpenAI chat: from {"input": "...", "output": "..."}
                    # to [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
                    data = {
                        "messages": [
                            {"role": "user", "content": data["input"]},
                            {"role": "assistant", "content": data["output"]},
                        ]
                    }

                    if num_data % 100 == 0:
                        f_val.write(json.dumps(data) + "\n")
                    else:
                        f_train.write(json.dumps(data) + "\n")
                    num_data += 1


if __name__ == "__main__":
    args = get_parser().parse_args()
    raw_dir = f"{args.output_dir}/openscience_raw"
    proc_dir = f"{args.output_dir}/openscience_proc"

    if not os.path.exists(raw_dir):
        # download_hf_dataset("nvidia/OpenScience", raw_dir)
        q235_subset = load_dataset("nvidia/OpenScience", data_files="OS-Q3-235B-4.jsonl")
        q235_subset.save_to_disk(raw_dir)

    if not os.path.exists(proc_dir):
        Path(proc_dir).mkdir(exist_ok=True)
        print("Processing OpenScience dataset")
        process_subset(raw_dir, proc_dir)
    else:
        print(f"Processed OpenScience dataset exists in: {proc_dir}, skipped processing")
    # process_openscience(raw_dir, proc_dir)
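
For reference, convert_row_oai maps one raw record to the OpenAI chat format used for finetuning. A tiny illustrative sketch with a made-up record (not from the dataset):

# Illustrative only: a made-up OpenScience-style record and the chat-format
# dictionary that convert_row_oai returns for it.
row = {"input": "What is 2 + 2?", "output": "4"}
print(convert_row_oai(row))
# {'messages': [{'role': 'user', 'content': 'What is 2 + 2?'},
#               {'role': 'assistant', 'content': '4'}]}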

examples/nemo_run/common/utils.py

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
from dataclasses import dataclass, field

import nemo_run as run
from nemo.collections import llm


@dataclass
class SlurmConfig:
    """Configuration for SlurmExecutor."""

    account: str = ""  # Your Slurm account
    partition_cpu: str = ""  # Slurm CPU partition to use
    partition_gpu: str = ""  # Slurm GPU partition to use
    time: str = ""  # Job time limit (HH:MM:SS)
    container_image: str = ""  # Container image for jobs
    env_vars: dict[str, str] = field(default_factory=dict)  # Environment variables to set
    container_mounts: list[str] = field(default_factory=list)  # Container mounts
    use_local_tunnel: bool = False  # Set to True if running from within the cluster
    host: str = ""  # Required for SSH tunnel: Slurm cluster hostname
    user: str = ""  # Required for SSH tunnel: Your username
    job_dir: str = ""  # Required for SSH tunnel: Directory to store runs on cluster
    identity: str | None = None  # Optional for SSH tunnel: Path to SSH key for authentication

    def __post_init__(self):
        """Validate the configuration and raise descriptive errors."""
        if not self.account:
            raise ValueError("SlurmConfig.account must be set to your actual Slurm account")
        if not self.partition_cpu:
            raise ValueError("SlurmConfig.partition_cpu must be set")
        if not self.partition_gpu:
            raise ValueError("SlurmConfig.partition_gpu must be set")
        if not self.time:
            raise ValueError("SlurmConfig.time must be set to job time limit (e.g., '02:00:00')")
        if not self.container_image:
            raise ValueError("SlurmConfig.container_image must be set to container image for jobs")
        if not self.use_local_tunnel:
            # Only validate SSH tunnel settings if not using local tunnel
            if not self.host:
                raise ValueError(
                    "SlurmConfig.host must be set to your actual cluster hostname when using SSH tunnel"
                )
            if not self.user:
                raise ValueError(
                    "SlurmConfig.user must be set to your actual username when using SSH tunnel"
                )
            if not self.job_dir:
                raise ValueError(
                    "SlurmConfig.job_dir must be set to directory for storing runs on cluster"
                )

        self.env_vars |= {
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",  # Disable GPU communication/computation overlap for performance
            "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
            "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
        }


def create_slurm_executor(
    slurm_cfg: SlurmConfig, nodes: int = 1, ntasks_per_node: int = 1, num_gpus: int = 0
):
    # Configure tunnel
    if slurm_cfg.use_local_tunnel:
        # Use LocalTunnel when already on the cluster
        tunnel = run.LocalTunnel(job_dir=slurm_cfg.job_dir)
    else:
        # Use SSH tunnel when launching from local machine
        tunnel = run.SSHTunnel(
            host=slurm_cfg.host,
            user=slurm_cfg.user,
            job_dir=slurm_cfg.job_dir,
            identity=slurm_cfg.identity,  # can be None
        )

    if num_gpus > 0:
        return run.SlurmExecutor(
            account=slurm_cfg.account,
            partition=slurm_cfg.partition_gpu,
            ntasks_per_node=ntasks_per_node,
            gpus_per_node=num_gpus,
            nodes=nodes,
            tunnel=tunnel,
            container_image=slurm_cfg.container_image,
            container_mounts=slurm_cfg.container_mounts,
            time=slurm_cfg.time,
            packager=run.GitArchivePackager(),
            mem="0",
            gres=f"gpu:{num_gpus}",
        )
    else:
        return run.SlurmExecutor(
            account=slurm_cfg.account,
            partition=slurm_cfg.partition_cpu,
            nodes=nodes,
            tunnel=tunnel,
            container_image=slurm_cfg.container_image,
            container_mounts=slurm_cfg.container_mounts,
            time=slurm_cfg.time,
            packager=run.GitArchivePackager(),
            mem="0",
        )


def get_finetune_recipe(recipe_name: str):
    if not hasattr(getattr(llm, recipe_name), "finetune_recipe"):
        raise ValueError(f"Recipe {recipe_name} does not have a Fine-Tuning recipe")
    return getattr(llm, recipe_name).finetune_recipe(peft_scheme=None)


def read_chat_template(template_path: str):
    with open(template_path) as f:
        return f.read().strip()


def download_hf_dataset(dataset_name: str, output_dir: str | None = None):
    """Download a dataset from HuggingFace Hub using huggingface-cli."""
    cmd = ["huggingface-cli", "download", dataset_name, "--repo-type", "dataset"]

    if output_dir:
        cmd.extend(["--local-dir", output_dir])

    subprocess.run(cmd, check=True)
    print(f"Successfully downloaded dataset: {dataset_name}")
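
A hypothetical end-to-end usage sketch for these helpers follows. The account, partitions, container image, paths, and recipe name are placeholders rather than values from this commit, and it assumes the repository root is on PYTHONPATH and that nemo_run's run.run accepts the recipe together with the executor.

# Hypothetical wiring of the helpers above; all values are placeholders.
import nemo_run as run

# Assumes the repo root is on PYTHONPATH so examples/nemo_run/common/utils.py is importable.
from examples.nemo_run.common.utils import SlurmConfig, create_slurm_executor, get_finetune_recipe

slurm_cfg = SlurmConfig(
    account="my_account",                        # placeholder Slurm account
    partition_cpu="cpu_partition",               # placeholder CPU partition
    partition_gpu="gpu_partition",               # placeholder GPU partition
    time="04:00:00",
    container_image="nvcr.io/nvidia/nemo:dev",   # placeholder container image
    host="slurm.example.com",                    # placeholder SSH tunnel host
    user="jdoe",
    job_dir="/scratch/jdoe/nemo_runs",
)

recipe = get_finetune_recipe("llama3_8b")        # assumes this recipe module exists in nemo.collections.llm
executor = create_slurm_executor(slurm_cfg, nodes=1, ntasks_per_node=8, num_gpus=8)
run.run(recipe, executor=executor)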
