# Slurm support for QAT Simplified Flow + Qwen3-8B recipe (#285)
**New file**: MMLU evaluation script (`@@ -0,0 +1,60 @@`):

```python
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

from nemo.collections.llm.modelopt import setup_trainer_and_restore_model_with_modelopt_spec

from modelopt.torch.export.plugins.nemo_run import _get_most_recent_ckpt
from modelopt.torch.utils.plugins.megatron_mmlu import megatron_mmlu


def parse_args():
    parser = argparse.ArgumentParser(
        description=(
            "Run MMLU evaluation with ModelOpt Megatron model. Provide either --nemo_ckpt "
            "or --finetuned_ckpt_dir"
        )
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--nemo_ckpt", type=str, required=False, help="Path to NeMo checkpoint.")
    group.add_argument(
        "--finetuned_ckpt_dir",
        required=False,
        type=str,
        help="Checkpoint directory of 1 or more finetuned models",
    )
    parser.add_argument(
        "--tensor_parallelism", type=int, default=1, help="Tensor parallelism size."
    )
    parser.add_argument(
        "--pipeline_parallelism", type=int, default=1, help="Pipeline parallelism size."
    )
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    ckpt_path = args.nemo_ckpt
    if args.finetuned_ckpt_dir:
        ckpt_path = _get_most_recent_ckpt(args.finetuned_ckpt_dir)
    model, trainer = setup_trainer_and_restore_model_with_modelopt_spec(
        ckpt_path,
        tensor_model_parallel_size=args.tensor_parallelism,
        pipeline_model_parallel_size=args.pipeline_parallelism,
        devices=args.tensor_parallelism * args.pipeline_parallelism,
    )
    tokenizer = model.tokenizer.tokenizer
    megatron_mmlu(model.module, tokenizer)
```
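For reference, a sketch of how this script might be invoked (the script name `eval_mmlu.py` and all paths are illustrative, not part of the PR):

```bash
# Evaluate a base NeMo checkpoint on MMLU with tensor parallelism 4 (paths are placeholders)
python eval_mmlu.py --nemo_ckpt /models/Qwen3-8B-nemo --tensor_parallelism 4

# Or point at a fine-tuning output directory; the most recent checkpoint is selected automatically
python eval_mmlu.py --finetuned_ckpt_dir /results/qat_flow_ckpts
```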
**New file**: OpenScience dataset preparation script (`@@ -0,0 +1,61 @@`):

```python
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from pathlib import Path

from datasets import load_dataset


def get_parser():
    parser = argparse.ArgumentParser(description="Process nvidia/OpenScience dataset")
    parser.add_argument("--output-dir", type=str, default=".")
    return parser


def convert_row_oai(row: dict):
    return {
        "messages": [
            {"role": "user", "content": row["input"]},
            {"role": "assistant", "content": row["output"]},
        ]
    }


def process_subset(raw_dir, proc_dir):
    ds = load_dataset(raw_dir)
    ds = ds.map(convert_row_oai, remove_columns=["input", "output"])

    split_ds = ds["train"].train_test_split(test_size=0.1)
    split_ds["train"].to_json(os.path.join(proc_dir, "training.jsonl"))
    split_ds["test"].to_json(os.path.join(proc_dir, "validation.jsonl"))


if __name__ == "__main__":
    args = get_parser().parse_args()
    raw_dir = f"{args.output_dir}/openscience_raw"
    proc_dir = f"{args.output_dir}/openscience_proc"

    if not os.path.exists(raw_dir):
        q235_subset = load_dataset("nvidia/OpenScience", data_files="OS-Q3-235B-4.jsonl")
        q235_subset.save_to_disk(raw_dir)

    if not os.path.exists(proc_dir):
        Path(proc_dir).mkdir(exist_ok=True)
        print("Processing OpenScience dataset")
        process_subset(raw_dir, proc_dir)
    else:
        print(f"Processed OpenScience dataset exists in: {proc_dir}, skipped processing")
```
**New file**: `examples/nemo_run/common/utils.py`, shared Slurm helpers (`@@ -0,0 +1,139 @@`):

```python
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import subprocess
from dataclasses import dataclass, field

import nemo_run as run
from nemo.collections import llm


@dataclass
class SlurmConfig:
    """Configuration for SlurmExecutor."""

    account: str = ""  # Your Slurm account
    partition_cpu: str = ""  # Slurm CPU partition to use
    partition_gpu: str = ""  # Slurm GPU partition to use
    time: str = ""  # Job time limit (HH:MM:SS)
    container_image: str = ""  # Container image for jobs
    env_vars: dict[str, str] = field(default_factory=dict)  # Environment variables to set
    container_mounts: list[str] = field(default_factory=list)  # Container mounts
    use_local_tunnel: bool = False  # Set to True if running from within the cluster
    host: str = ""  # Required for SSH tunnel: Slurm cluster hostname
    user: str = ""  # Required for SSH tunnel: Your username
    job_dir: str = ""  # Required for SSH tunnel: Directory to store runs on cluster
    identity: str | None = None  # Optional for SSH tunnel: Path to SSH key for authentication

    def __post_init__(self):
        """Validate the configuration and raise descriptive errors."""
        if not self.account:
            raise ValueError("SlurmConfig.account must be set to your actual Slurm account")
        if not self.partition_cpu:
            raise ValueError("SlurmConfig.partition_cpu must be set")
        if not self.partition_gpu:
            raise ValueError("SlurmConfig.partition_gpu must be set")
        if not self.time:
            raise ValueError("SlurmConfig.time must be set to job time limit (e.g., '02:00:00')")
        if not self.container_image:
            raise ValueError("SlurmConfig.container_image must be set to container image for jobs")
        if not self.use_local_tunnel:
            # Only validate SSH tunnel settings if not using local tunnel
            if not self.host:
                raise ValueError(
                    "SlurmConfig.host must be set to your actual cluster hostname when using SSH tunnel"
                )
            if not self.user:
                raise ValueError(
                    "SlurmConfig.user must be set to your actual username when using SSH tunnel"
                )
            if not self.job_dir:
                raise ValueError(
                    "SlurmConfig.job_dir must be set to directory for storing runs on cluster"
                )

        self.env_vars |= {
            "CUDA_DEVICE_MAX_CONNECTIONS": "1",  # Disable GPU communication/computation overlap for performance
            "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
            "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
        }


def create_slurm_executor(
    slurm_cfg: SlurmConfig, nodes: int = 1, ntasks_per_node: int = 1, num_gpus: int = 0
):
    # Configure tunnel
    if slurm_cfg.use_local_tunnel:
        # Use LocalTunnel when already on the cluster
        tunnel = run.LocalTunnel(job_dir=slurm_cfg.job_dir)
    else:
        # Use SSH tunnel when launching from local machine
        tunnel = run.SSHTunnel(
            host=slurm_cfg.host,
            user=slurm_cfg.user,
            job_dir=slurm_cfg.job_dir,
            identity=slurm_cfg.identity,  # can be None
        )

    if num_gpus > 0:
        return run.SlurmExecutor(
            account=slurm_cfg.account,
            partition=slurm_cfg.partition_gpu,
            ntasks_per_node=ntasks_per_node,
            gpus_per_node=num_gpus,
            nodes=nodes,
            tunnel=tunnel,
            container_image=slurm_cfg.container_image,
            container_mounts=slurm_cfg.container_mounts,
            time=slurm_cfg.time,
            packager=run.GitArchivePackager(),
            mem="0",
            gres=f"gpu:{num_gpus}",
        )
    else:
        return run.SlurmExecutor(
            account=slurm_cfg.account,
            partition=slurm_cfg.partition_cpu,
            nodes=nodes,
            tunnel=tunnel,
            container_image=slurm_cfg.container_image,
            container_mounts=slurm_cfg.container_mounts,
            time=slurm_cfg.time,
            packager=run.GitArchivePackager(),
            mem="0",
        )


def get_finetune_recipe(recipe_name: str):
    if not hasattr(getattr(llm, recipe_name), "finetune_recipe"):
        raise ValueError(f"Recipe {recipe_name} does not have a Fine-Tuning recipe")
    return getattr(llm, recipe_name).finetune_recipe(peft_scheme=None)


def read_chat_template(template_path: str):
    with open(template_path) as f:
        return f.read().strip()


def download_hf_dataset(dataset_name: str, output_dir: str | None = None):
    """Download a dataset from HuggingFace Hub using huggingface-cli."""
    cmd = ["huggingface-cli", "download", dataset_name, "--repo-type", "dataset"]

    if output_dir:
        cmd.extend(["--local-dir", output_dir])

    subprocess.run(cmd, check=True)
    print(f"Successfully downloaded dataset: {dataset_name}")
```

This file received three review comments:

**Lines 67-72: don't override caller-provided env vars; reverse the merge.** `__post_init__` merges the defaults over whatever the caller already put in `env_vars`. Suggested change:

```diff
-        self.env_vars |= {
-            "CUDA_DEVICE_MAX_CONNECTIONS": "1",  # Disable GPU communication/computation overlap for performance
-            "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
-            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
-            "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
-        }
+        defaults = {
+            "CUDA_DEVICE_MAX_CONNECTIONS": "1",  # Disable GPU communication/computation overlap for performance
+            "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
+            "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+            "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
+        }
+        # User-specified values take precedence
+        self.env_vars = defaults | self.env_vars
```

**Lines 75-106 (also applies to 107-117): propagate env vars to Slurm jobs.** `slurm_cfg.env_vars` is assembled in `__post_init__` but never passed to either executor. Suggested change:

```diff
             packager=run.GitArchivePackager(),
             mem="0",
             gres=f"gpu:{num_gpus}",
+            env_vars=slurm_cfg.env_vars,  # verify parameter name: env_vars/env
         )
@@
             packager=run.GitArchivePackager(),
             mem="0",
+            env_vars=slurm_cfg.env_vars,  # verify parameter name: env_vars/env
         )
```

If the constructor doesn't support the argument, set it on the instance after construction, if available.

**Lines 120-124: fix `AttributeError` risk when the recipe module is missing.** `getattr(llm, recipe_name)` raises `AttributeError` before the intended `ValueError` can fire when the recipe doesn't exist. Suggested change:

```diff
-def get_finetune_recipe(recipe_name: str):
-    if not hasattr(getattr(llm, recipe_name), "finetune_recipe"):
-        raise ValueError(f"Recipe {recipe_name} does not have a Fine-Tuning recipe")
-    return getattr(llm, recipe_name).finetune_recipe(peft_scheme=None)
+def get_finetune_recipe(recipe_name: str):
+    recipe_mod = getattr(llm, recipe_name, None)
+    if recipe_mod is None or not hasattr(recipe_mod, "finetune_recipe"):
+        raise ValueError(f"Recipe {recipe_name} does not exist or lacks a fine-tuning recipe")
+    return recipe_mod.finetune_recipe(peft_scheme=None)
```
**New file**: QAT/QAD flow advanced-topics guide (`@@ -0,0 +1,56 @@`):
# NeMo QAT/QAD Flow: Advanced Topics

If you need to run QAT/QAD on a Slurm cluster (for example, to use more than one node), this guide covers how to configure and launch the flow on Slurm.

To run the example on Slurm, edit the `SLURM_CONFIG` at the bottom of `nemo_qat_flow.py` with the appropriate credentials, container, cluster name (host), and container mounts; a sketch follows below. Make sure you mount the NeMo and Megatron-LM repositories mentioned above on the Slurm cluster and that you've checked out the correct commits.
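As a sketch, a filled-in `SLURM_CONFIG` might look like the following (account, partitions, host, container tag, and mounts are illustrative placeholders; the field names follow the `SlurmConfig` dataclass in `examples/nemo_run/common/utils.py`):

```python
SLURM_CONFIG = SlurmConfig(
    account="my_account",  # your Slurm account (placeholder)
    partition_cpu="cpu_short",  # CPU partition for data-prep stages
    partition_gpu="gpu",  # GPU partition for PTQ/training/eval stages
    time="04:00:00",  # per-job time limit
    container_image="nvcr.io/nvidia/nemo:25.04",  # illustrative NeMo container tag
    container_mounts=[
        "/path/to/NeMo:/opt/NeMo",  # your NeMo checkout
        "/path/to/Megatron-LM:/opt/megatron-lm",  # your Megatron-LM checkout
    ],
    use_local_tunnel=False,  # False => launch over SSH from your own machine
    host="my-cluster.example.com",  # SSH tunnel: cluster hostname
    user="jdoe",  # SSH tunnel: your username
    job_dir="/home/jdoe/nemo_run_jobs",  # where runs are stored on the cluster
)
```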
## Running the Flow on Slurm

To launch the flow on a Slurm cluster, fill in your Slurm credentials at the bottom of `nemo_qat_flow.py` and add the `--use-slurm` flag to the command. On a different server (e.g., your local machine), launch the NeMo container as described in the [README](README.md), then run `python qat/nemo_qat_flow.py --use-slurm --log-dir /slurm/log/dir`. This will `ssh` into the Slurm cluster, `rsync` your files over, and launch the tasks. After an experiment has run, the log directory on the Slurm cluster should look like this (assuming your experiment name is `qat_flow_ckpts`):
```bash
qat_flow_ckpts  qat_flow_ckpts_1755708286
```
If you `cd` into the experiment itself, e.g. `cd qat_flow_ckpts_1755708286`, you'll find a directory structure like the following. Each folder corresponds to one stage of the Simplified Flow, and inside each stage you can see that stage's logs as well as the sbatch command that was run. To watch a stage while it runs, `cd` into it and `tail -f` the log file, as shown after the listing below.
```bash
├── 00_openscience_data
│   ├── code
│   ├── configs
│   ├── log-coreai_dlalgo_modelopt-modelopt.00_openscience_data_5345664_0.out
│   └── sbatch_coreai_dlalgo_modelopt-modelopt.00_openscience_data_5345664.out
├── 01_import_model
│   ├── code
│   ├── configs
│   ├── log-coreai_dlalgo_modelopt-modelopt.01_import_model_5345665_0.out
│   └── sbatch_coreai_dlalgo_modelopt-modelopt.01_import_model_5345665.out
├── 02_mmlu_bf16
│   ├── code
│   ├── configs
│   ├── log-coreai_dlalgo_modelopt-modelopt.02_mmlu_bf16_5345666_0.out
│   └── sbatch_coreai_dlalgo_modelopt-modelopt.02_mmlu_bf16_5345666.out
├── 03_ptq
│   ├── code
│   ├── configs
│   ├── log-coreai_dlalgo_modelopt-modelopt.03_ptq_5345667_0.out
│   └── sbatch_coreai_dlalgo_modelopt-modelopt.03_ptq_5345667.out
├── 04_mmlu_ptq
│   ├── code
│   ├── configs
│   ├── log-coreai_dlalgo_modelopt-modelopt.04_mmlu_ptq_5345668_0.out
│   └── sbatch_coreai_dlalgo_modelopt-modelopt.04_mmlu_ptq_5345668.out
├── 05_train
│   ├── code
│   ├── configs
│   ├── log-coreai_dlalgo_modelopt-modelopt.05_train_5345669_0.out
│   └── sbatch_coreai_dlalgo_modelopt-modelopt.05_train_5345669.out
├── 06_mmlu_sft
│   ├── code
│   └── configs
├── 07_export_hf
│   ├── code
│   └── configs
```
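For example, to follow the training stage while it runs (paths follow the layout above):

```bash
cd qat_flow_ckpts_1755708286/05_train
tail -f log-coreai_dlalgo_modelopt-modelopt.05_train_5345669_0.out
```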
**NOTE:** `rsync` may not currently be available in the NeMo container; it will be added as a dependency.