Skip to content

Commit b4dbaac

Browse files
committed
add nemo run export
Signed-off-by: Jennifer Chen <[email protected]>
1 parent 34d5276 commit b4dbaac

File tree

2 files changed

+71
-38
lines changed

2 files changed

+71
-38
lines changed

examples/nemo_run/common/process_openscience.py

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -45,49 +45,12 @@ def process_subset(raw_dir, proc_dir):
4545
split_ds["test"].to_json(os.path.join(proc_dir, "validation.jsonl"))
4646

4747

48-
# TODO remove below?
49-
def sample_openscience(raw_dir, proc_dir, sample_ratio=1):
    """Subsample raw OpenScience JSONL files into a 99/1 train/validation split.

    Every ``*.jsonl`` file directly inside ``raw_dir`` is read in sorted
    filename order. Within each file, one line out of every ``sample_ratio``
    lines (starting with the first) is kept, converted from
    ``{"input": ..., "output": ...}`` to the OpenAI chat layout
    ``{"messages": [user, assistant]}``, and appended to either
    ``training.jsonl`` or ``validation.jsonl`` inside ``proc_dir``.

    Args:
        raw_dir: Directory that holds the raw ``.jsonl`` files.
        proc_dir: Existing directory that receives ``training.jsonl`` and
            ``validation.jsonl``. Both files are opened in append mode, so
            running this twice adds to the earlier output.
        sample_ratio: Keep one line out of every ``sample_ratio`` lines of
            each input file. The default of 1 keeps every line.
    """
    train_path = os.path.join(proc_dir, "training.jsonl")
    val_path = os.path.join(proc_dir, "validation.jsonl")
    # Kept samples so far, counted across all input files so the 1-in-100
    # validation pick is global rather than per file.
    num_data = 0

    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting split reproducible across machines and runs.
    for file in sorted(os.listdir(raw_dir)):
        # Match the real extension, not any name that merely ends in "jsonl".
        if not file.endswith(".jsonl"):
            continue

        print(f"Sampling from {file}")
        with (
            open(os.path.join(raw_dir, file), encoding="utf-8") as f_raw,
            open(train_path, "a", encoding="utf-8") as f_train,
            open(val_path, "a", encoding="utf-8") as f_val,
        ):
            for idx, line in enumerate(f_raw):
                if idx % sample_ratio != 0:
                    continue
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue

                record = json.loads(line)
                # Convert {"input": "...", "output": "..."} to the OpenAI chat form
                # [{"role": "user", ...}, {"role": "assistant", ...}].
                chat = {
                    "messages": [
                        {"role": "user", "content": record["input"]},
                        {"role": "assistant", "content": record["output"]},
                    ]
                }

                # Every 100th kept sample (the 1st, 101st, ...) goes to validation.
                target = f_val if num_data % 100 == 0 else f_train
                target.write(json.dumps(chat) + "\n")
                num_data += 1
82-
83-
8448
if __name__ == "__main__":
8549
args = get_parser().parse_args()
8650
raw_dir = f"{args.output_dir}/openscience_raw"
8751
proc_dir = f"{args.output_dir}/openscience_proc"
8852

8953
if not os.path.exists(raw_dir):
90-
# download_hf_dataset("nvidia/OpenScience", raw_dir)
9154
q235_subset = load_dataset("nvidia/OpenScience", data_files="OS-Q3-235B-4.jsonl")
9255
q235_subset.save_to_disk(raw_dir)
9356

@@ -97,4 +60,3 @@ def sample_openscience(raw_dir, proc_dir, sample_ratio=1):
9760
process_subset(raw_dir, proc_dir)
9861
else:
9962
print(f"Processed OpenScience dataset exists in: {proc_dir}, skipped processing")
100-
# process_openscience(raw_dir, proc_dir)
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Export functions for NeMo Run."""
17+
18+
from pathlib import Path
19+
20+
from nemo.collections.llm.api import export_ckpt
21+
from nemo.utils import logging
22+
23+
24+
def export_most_recent_ckpt(directory: str, output_path: str):
    """Find the newest checkpoint of a NeMo Run experiment and export it as "hf".

    Args:
        directory: NeMo Run experiment directory, handed to
            ``_get_most_recent_ckpt`` to locate the checkpoint.
        output_path: Destination forwarded to ``export_ckpt``. The call passes
            ``overwrite=True``.
    """
    ckpt_path = _get_most_recent_ckpt(directory)
    logging.info(f"Exporting most recent NeMo Run checkpoint: {ckpt_path}")
    export_ckpt(
        ckpt_path,
        "hf",
        output_path=output_path,
        overwrite=True,
        # Forwarded unchanged to the ModelOpt export step.
        modelopt_export_kwargs={"export_extra_modules": True},
    )
36+
37+
38+
def _get_most_recent_subdir(directory: "Path | str") -> Path:
    """Return the immediate subdirectory of ``directory`` modified most recently.

    Args:
        directory: Directory to scan, given as a ``Path`` or a plain string path.

    Returns:
        The child directory with the largest ``st_mtime``.

    Raises:
        ValueError: If ``directory`` contains no subdirectories.
    """
    # Normalise so string paths work too; wrapping an existing Path is harmless.
    directory = Path(directory)

    # Only direct children that are directories are candidates; files are ignored.
    subdirs = [entry for entry in directory.iterdir() if entry.is_dir()]
    if not subdirs:
        raise ValueError(f"No subdirectories found in {directory}")

    # max() selects the newest entry directly; no full sort is required.
    return max(subdirs, key=lambda entry: entry.stat().st_mtime)
48+
49+
50+
def _get_most_recent_ckpt(directory: str):
    """Find the most recent checkpoint subdirectory in a NeMo Run experiment directory.

    The search starts at ``<directory>/default``. If that folder has a
    ``checkpoints`` child, the newest subdirectory of it is returned. Otherwise
    the newest subdirectory of ``default`` is entered first and its
    ``checkpoints`` child is searched instead.

    Args:
        directory (str): Path to the experiment directory to search in.

    Returns:
        str: Path to the most recently modified checkpoint subdirectory.

    Raises:
        FileNotFoundError: If ``<directory>/default`` is missing, or if no
            ``checkpoints`` directory exists at either expected location.
        ValueError: Propagated from ``_get_most_recent_subdir`` when a searched
            directory has no subdirectories.
    """
    exp_dir = Path(directory) / "default"
    # Explicit raise instead of assert: asserts vanish under ``python -O``,
    # which would let a bad path slip through to a less helpful error later.
    if not exp_dir.exists():
        raise FileNotFoundError(f"Experiment directory {exp_dir} does not exist")

    checkpoint_dir = exp_dir / "checkpoints"
    if not checkpoint_dir.exists():
        # No checkpoints directly under "default": descend into its newest
        # subdirectory and look for "checkpoints" there.
        newest_child = _get_most_recent_subdir(exp_dir)
        checkpoint_dir = newest_child / "checkpoints"
        if not checkpoint_dir.exists():
            raise FileNotFoundError(f"Checkpoint directory {checkpoint_dir} does not exist")

    return str(_get_most_recent_subdir(checkpoint_dir))

0 commit comments

Comments
 (0)