Commit 163b148

Merge branch 'main' into vjawa/fix_path

2 parents 30d2745 + b83c9ff
12 files changed: +1755 -112

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ jobs:
       - name: Run tests ${{ matrix.folder }} (CPU)
         timeout-minutes: 40
         run: |
-          uv sync --link-mode copy --locked --extra audio_cpu --extra text_cpu --extra video_cpu --group test
+          uv sync --link-mode copy --locked --extra audio_cpu --extra sdg_cpu --extra text_cpu --extra video_cpu --group test
           FOLDER="${{ matrix.folder }}"
           FOLDER="${FOLDER/stages-/stages/}"
           uv run coverage run --branch --source=nemo_curator -m pytest -v "tests/$FOLDER" -m "not gpu"
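
For reference, the `${FOLDER/stages-/stages/}` expansion rewrites only the first occurrence of `stages-` as `stages/`, mapping a matrix folder name onto the on-disk test directory. A rough Python equivalent, where `stages-text` is an assumed example folder name:

# Rough Python equivalent of bash's ${FOLDER/stages-/stages/}, which rewrites
# only the first occurrence; "stages-text" is an assumed example folder name.
folder = "stages-text"
print(folder.replace("stages-", "stages/", 1))  # -> stages/text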

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ repos:
       - id: requirements-txt-fixer
       - id: trailing-whitespace
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.11.4
+    rev: v0.14.10
     hooks:
       # Run the linter
       - id: ruff

benchmarking/nightly-benchmark.yaml

Lines changed: 31 additions & 3 deletions

@@ -59,7 +59,10 @@ datasets:
     formats:
       - type: "bin"
         path: "{model_weights_path}/fasttext/lid.176.bin"
-
+  - name: "gretel_symptoms"
+    formats:
+      - type: "jsonl"
+        path: "{datasets_path}/gretel_symptoms"
 default_timeout_s: 7200

 # Optional sinks
@@ -470,7 +473,7 @@ entries:
        exact_value: 3800
      - metric: throughput_images_per_sec
        min_value: 3.0
-
+
  - name: audio_fleurs
    enabled: true
    script: audio_fleurs_benchmark.py
@@ -561,7 +564,7 @@ entries:
        exact_value: 1400
      - metric: throughput_clips_per_sec
        min_value: 4.0
-
+
  - name: video_transcoding
    enabled: true
    script: video_pipeline_benchmark.py
@@ -609,6 +612,31 @@ entries:
      - metric: throughput_clips_per_sec
        min_value: 0.25

+  - name: ndd_nvidia_nim
+    enabled: true
+    script: ndd_benchmark.py
+    args: >-
+      --benchmark-results-path={session_entry_dir}
+      --input-path={dataset:gretel_symptoms,jsonl}
+      --output-path={session_entry_dir}/scratch/output
+      --model-type=nvidia-nim
+      --model-id=openai/gpt-oss-20b
+      --executor=ray_data
+    timeout_s: 3600
+    ray:
+      num_cpus: 8
+      num_gpus: 0
+      enable_object_spilling: false
+    sink_data:
+      - name: slack
+    additional_metrics:
+      - input_row_count
+      - output_row_count
+      - throughput_rows_per_sec
+    requirements:
+      - metric: output_row_count
+        exact_value: 820
+
  - name: video_transnetv2_motion_aesthetic_filter_embeddings
    enabled: true
    script: video_pipeline_benchmark.py
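
The `requirements` blocks above gate each entry on collected metrics: `exact_value` demands an exact match, `min_value` a floor. A minimal sketch of how such checks could be evaluated — the `check_requirements` helper is hypothetical, not the orchestrator's actual code; only the key names come from the YAML above:

# Hypothetical sketch: evaluate benchmark "requirements" entries against
# collected metrics, mirroring the exact_value/min_value keys used above.
from typing import Any


def check_requirements(metrics: dict[str, float], requirements: list[dict[str, Any]]) -> list[str]:
    """Return human-readable failures; an empty list means every check passed."""
    failures = []
    for req in requirements:
        value = metrics.get(req["metric"])
        if value is None:
            failures.append(f"missing metric: {req['metric']}")
        elif "exact_value" in req and value != req["exact_value"]:
            failures.append(f"{req['metric']}={value}, expected exactly {req['exact_value']}")
        elif "min_value" in req and value < req["min_value"]:
            failures.append(f"{req['metric']}={value}, expected >= {req['min_value']}")
    return failures


# Checking the ndd_nvidia_nim entry's requirement against a sample metrics dict:
print(check_requirements({"output_row_count": 820}, [{"metric": "output_row_count", "exact_value": 820}]))  # -> []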
ndd_benchmark.py

Lines changed: 268 additions & 0 deletions

# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ruff: noqa: PLR0913

"""NeMo Data Designer (NDD) benchmarking script.

Benchmarks synthetic data generation via NDD using the NVIDIA NIM cloud API.

Usage from the benchmarking orchestrator (run.py) -- see ndd.yaml for the
full configuration. Can also be run standalone:

    python ndd_benchmark.py \
        --benchmark-results-path /tmp/results \
        --input-path ./data/ndd \
        --output-path /tmp/ndd_output \
        --model-type nvidia-nim \
        --model-id openai/gpt-oss-20b \
        --executor ray_data
"""

import argparse
import os
import time
from pathlib import Path
from typing import Any

import data_designer.config as dd
from loguru import logger
from utils import setup_executor, write_benchmark_results

from nemo_curator.pipeline import Pipeline
from nemo_curator.stages.synthetic.nemo_data_designer.data_designer import DataDesignerStage
from nemo_curator.stages.text.io.reader.jsonl import JsonlReader
from nemo_curator.stages.text.io.writer.jsonl import JsonlWriter
from nemo_curator.tasks.utils import TaskPerfUtils
from nemo_curator.utils.file_utils import get_all_file_paths_under

# ---------------------------------------------------------------------------
# Data Designer config builder
# ---------------------------------------------------------------------------


def _build_config(model_id: str) -> dd.DataDesignerConfigBuilder:
    """Build the DataDesigner config for the medical-notes generation task."""
    model_alias = model_id

    model_configs = [
        dd.ModelConfig(
            alias=model_alias,
            model=model_id,
            provider="nvidia",
            skip_health_check=False,
            inference_parameters=dd.ChatCompletionInferenceParams(
                temperature=1.0,
                top_p=1.0,
                max_tokens=2048,
            ),
        ),
    ]

    config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)

    # -- Sampler columns ------------------------------------------------
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="patient_sampler",
            sampler_type=dd.SamplerType.PERSON_FROM_FAKER,
            params=dd.PersonFromFakerSamplerParams(),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="doctor_sampler",
            sampler_type=dd.SamplerType.PERSON_FROM_FAKER,
            params=dd.PersonFromFakerSamplerParams(),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="patient_id",
            sampler_type=dd.SamplerType.UUID,
            params=dd.UUIDSamplerParams(prefix="PT-", short_form=True, uppercase=True),
        )
    )

    # -- Expression columns ---------------------------------------------
    config_builder.add_column(dd.ExpressionColumnConfig(name="first_name", expr="{{ patient_sampler.first_name }}"))
    config_builder.add_column(dd.ExpressionColumnConfig(name="last_name", expr="{{ patient_sampler.last_name }}"))
    config_builder.add_column(dd.ExpressionColumnConfig(name="dob", expr="{{ patient_sampler.birth_date }}"))
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="symptom_onset_date",
            sampler_type=dd.SamplerType.DATETIME,
            params=dd.DatetimeSamplerParams(start="2024-01-01", end="2024-12-31"),
        )
    )
    config_builder.add_column(
        dd.SamplerColumnConfig(
            name="date_of_visit",
            sampler_type=dd.SamplerType.TIMEDELTA,
            params=dd.TimeDeltaSamplerParams(dt_min=1, dt_max=30, reference_column_name="symptom_onset_date"),
        )
    )
    config_builder.add_column(dd.ExpressionColumnConfig(name="physician", expr="Dr. {{ doctor_sampler.last_name }}"))

    # -- LLM column -----------------------------------------------------
    config_builder.add_column(
        dd.LLMTextColumnConfig(
            name="physician_notes",
            prompt="""\
You are a primary-care physician who just had an appointment with {{ first_name }} {{ last_name }},
who has been struggling with symptoms from {{ diagnosis }} since {{ symptom_onset_date }}.
The date of today's visit is {{ date_of_visit }}.

{{ patient_summary }}

Write careful notes about your visit with {{ first_name }},
as Dr. {{ doctor_sampler.first_name }} {{ doctor_sampler.last_name }}.

Format the notes as a busy doctor might.
Respond with only the notes, no other text.
""",
            model_alias=model_alias,
        )
    )

    return config_builder


# ---------------------------------------------------------------------------
# Benchmark runner
# ---------------------------------------------------------------------------


def run_ndd_benchmark(
    model_type: str,
    model_id: str,
    input_path: str,
    output_path: str,
    executor: str,
    num_files: int | None,
    **kwargs,  # noqa: ARG001
) -> dict[str, Any]:
    """Run the NDD benchmark and collect metrics."""
    input_path = Path(input_path)
    output_path = Path(output_path).absolute()
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info(f"Model type: {model_type}")
    logger.info(f"Model ID: {model_id}")
    logger.info(f"Input path: {input_path}")
    logger.info(f"Output path: {output_path}")
    logger.info(f"Executor: {executor}")

    # Resolve input files using Curator utility
    input_files = get_all_file_paths_under(str(input_path), keep_extensions="jsonl")
    if num_files is not None and num_files > 0:
        logger.info(f"Using {num_files} of {len(input_files)} input files")
        input_files = input_files[:num_files]

    # -- Environment setup: nvidia-nim requires NVIDIA_API_KEY ----------
    if not os.environ.get("NVIDIA_API_KEY"):
        msg = "NVIDIA_API_KEY must be set for nvidia-nim model type"
        raise OSError(msg)

    # -- Build config and run pipeline ----------------------------------
    config_builder = _build_config(model_id)

    executor_obj = setup_executor(executor)

    pipeline = Pipeline(
        name="ndd_benchmark_pipeline",
        stages=[
            JsonlReader(file_paths=input_files, fields=["diagnosis", "patient_summary"]),
            DataDesignerStage(config_builder=config_builder),
            JsonlWriter(path=str(output_path)),
        ],
    )

    logger.info("Starting NDD pipeline...")
    run_start_time = time.perf_counter()
    output_tasks = pipeline.run(executor_obj)
    run_time_taken = time.perf_counter() - run_start_time

    # -- Post-run: extract metrics from _stage_perf ----------------------
    input_row_count = int(TaskPerfUtils.get_aggregated_stage_stat(output_tasks, "DataDesignerStage", "custom.num_input_records"))
    output_row_count = int(TaskPerfUtils.get_aggregated_stage_stat(output_tasks, "DataDesignerStage", "custom.num_output_records"))
    input_tokens_median_per_record = float(TaskPerfUtils.get_aggregated_stage_stat(output_tasks, "DataDesignerStage", "custom.input_tokens_median_per_record"))
    output_tokens_median_per_record = float(TaskPerfUtils.get_aggregated_stage_stat(output_tasks, "DataDesignerStage", "custom.output_tokens_median_per_record"))
    throughput_rows_per_sec = output_row_count / run_time_taken if run_time_taken > 0 else 0

    logger.success(f"NDD benchmark completed in {run_time_taken:.2f}s")
    logger.success(f"Input: {input_row_count} rows")
    logger.success(f"Output: {output_row_count} rows")
    logger.success(f"Input tokens median per record: {input_tokens_median_per_record:,}")
    logger.success(f"Output tokens median per record: {output_tokens_median_per_record:,}")
    logger.success(f"Throughput: {throughput_rows_per_sec:.2f} rows/sec")

    return {
        "metrics": {
            "is_success": True,
            "time_taken_s": run_time_taken,
            "model_type": model_type,
            "model_id": model_id,
            "input_row_count": input_row_count,
            "output_row_count": output_row_count,
            "input_tokens_median_per_record": input_tokens_median_per_record,
            "output_tokens_median_per_record": output_tokens_median_per_record,
            "throughput_rows_per_sec": throughput_rows_per_sec,
            "num_files": num_files or "all",
        },
        "tasks": output_tasks,
    }


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def main() -> int:
    parser = argparse.ArgumentParser(description="NeMo Data Designer (NDD) benchmark")
    parser.add_argument("--benchmark-results-path", required=True, help="Path to write benchmark results")
    parser.add_argument("--input-path", required=True, help="Path to input JSONL seed data")
    parser.add_argument("--output-path", required=True, help="Path to write generated output")
    parser.add_argument(
        "--model-type",
        required=True,
        choices=["nvidia-nim"],
        help="Model serving backend",
    )
    parser.add_argument("--model-id", default="openai/gpt-oss-20b", help="Model identifier")
    parser.add_argument("--executor", default="ray_data", choices=["ray_data", "xenna"], help="Pipeline executor")
    parser.add_argument("--num-files", type=int, default=None, help="Limit number of input files (default: all)")

    args = parser.parse_args()

    logger.info("=== NDD Benchmark Starting ===")
    logger.info(f"Arguments: {vars(args)}")

    success_code = 1
    result_dict: dict[str, Any] = {
        "params": vars(args),
        "metrics": {"is_success": False},
        "tasks": [],
    }
    try:
        result_dict.update(run_ndd_benchmark(**vars(args)))
        success_code = 0 if result_dict["metrics"]["is_success"] else 1
    finally:
        write_benchmark_results(result_dict, args.benchmark_results_path)
    return success_code


if __name__ == "__main__":
    raise SystemExit(main())
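
When running the script standalone, the input path must contain JSONL files whose records carry the two fields the reader selects, `diagnosis` and `patient_summary`. A quick sketch for producing a toy seed file — the record values are invented placeholders; only the field names and the `./data/ndd` path come from the script above:

# Hypothetical seed records: the "diagnosis" and "patient_summary" field names
# come from the JsonlReader call above; the values are invented placeholders.
import json
from pathlib import Path

seed_dir = Path("./data/ndd")
seed_dir.mkdir(parents=True, exist_ok=True)

records = [
    {"diagnosis": "seasonal allergies", "patient_summary": "Adult patient with recurring congestion and itchy eyes."},
    {"diagnosis": "mild hypertension", "patient_summary": "Occasional headaches; family history of high blood pressure."},
]

with (seed_dir / "seed.jsonl").open("w") as f:
    for record in records:
        f.write(json.dumps(record) + "\n")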

benchmarking/tools/run.sh

Lines changed: 2 additions & 0 deletions

@@ -23,6 +23,7 @@ MLFLOW_TRACKING_URI=${MLFLOW_TRACKING_URI:-""}
 SLACK_WEBHOOK_URL=${SLACK_WEBHOOK_URL:-""}
 GDRIVE_FOLDER_ID=${GDRIVE_FOLDER_ID:-""}
 GDRIVE_SERVICE_ACCOUNT_FILE=${GDRIVE_SERVICE_ACCOUNT_FILE:-""}
+NVIDIA_API_KEY=${NVIDIA_API_KEY:-""}

 # get the following vars from the command line, config file(s), etc. and
 # set them in this environment:
@@ -68,6 +69,7 @@ docker run \
   --env=GDRIVE_SERVICE_ACCOUNT_FILE=${GDRIVE_SERVICE_ACCOUNT_FILE} \
   --env=CURATOR_BENCHMARKING_DEBUG=${CURATOR_BENCHMARKING_DEBUG} \
   --env=HOST_HOSTNAME=$(hostname) \
+  --env=NVIDIA_API_KEY=${NVIDIA_API_KEY} \
   \
   ${BASH_ENTRYPOINT_OVERRIDE} \
   ${CURATOR_BENCHMARKING_IMAGE} \
__init__.py

Lines changed: 3 additions & 0 deletions

from .data_designer import DataDesignerStage

__all__ = ["DataDesignerStage"]
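
With this re-export, callers can import the stage from the package itself rather than its `data_designer` submodule. A usage sketch, assuming this file is the `__init__.py` of the `nemo_curator.stages.synthetic.nemo_data_designer` package referenced by the benchmark script's import:

# Assumes the re-export above is the __init__.py of the
# nemo_curator.stages.synthetic.nemo_data_designer package, matching the
# import path used in ndd_benchmark.py.
from nemo_curator.stages.synthetic.nemo_data_designer import DataDesignerStage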
