Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,16 @@ deploy(
### 2. Evaluate the Model

```python
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationTarget, EvaluationConfig, ApiEndpoint
from nvidia_eval_commons.core.evaluate import evaluate
from nvidia_eval_commons.api.api_dataclasses import ApiEndpoint, EvaluationConfig, EvaluationTarget

# Configure evaluation
api_endpoint = ApiEndpoint(
url="http://0.0.0.0:8080/v1/completions/",
model_id="megatron_model"
)
target = EvaluationTarget(api_endpoint=api_endpoint)
config = EvaluationConfig(type="gsm8k")
config = EvaluationConfig(type="gsm8k", output_dir="results")

# Run evaluation
results = evaluate(target_cfg=target, eval_cfg=config)
Expand Down Expand Up @@ -140,16 +140,18 @@ deploy(
### Basic Evaluation

```python
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationTarget, EvaluationConfig, ApiEndpoint, ConfigParams
from nvidia_eval_commons.core.evaluate import evaluate
from nvidia_eval_commons.api.api_dataclasses import ApiEndpoint, ConfigParams, EvaluationConfig, EvaluationTarget
# Configure Endpoint
api_endpoint = ApiEndpoint(
url="http://0.0.0.0:8080/v1/completions/",
model_id="megatron_model"
)
# Evaluation target configuration
target = EvaluationTarget(api_endpoint=api_endpoint)
# Configure EvaluationConfig with the benchmark type, the output directory, the number of samples to evaluate on, etc.
config = EvaluationConfig(type="gsm8k",
output_dir="results",
params=ConfigParams(
limit_samples=10
))
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,15 @@ classifiers = [
"Topic :: Utilities",
]
dependencies = [
"nvidia-lm-eval==25.6.1",
"nvidia-lm-eval==25.7.1",
"uvicorn",
"flask",
"megatron-core>=0.13.0a0,<0.14.0",
"nvidia-modelopt[torch,onnx]>=0.31.0a0,<0.32.0; sys_platform != 'darwin'",
"nvidia-resiliency-ext>=0.3.0a0,<0.4.0; sys_platform != 'darwin'",
"nemo-export-deploy>=0.1.0a0,<0.2.0",
"pandas>2.0.0",
"nvidia-eval-commons~=1.0.0",
]

[project.optional-dependencies]
Expand Down
5 changes: 3 additions & 2 deletions scripts/evaluation_with_nemo_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@
from typing import Optional

import nemo_run as run
from nvidia_eval_commons.api.api_dataclasses import ApiEndpoint, ConfigParams, EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.core.evaluate import evaluate

from nemo_eval.api import deploy, evaluate
from nemo_eval.utils.api import ApiEndpoint, ConfigParams, EvaluationConfig, EvaluationTarget
from nemo_eval.api import deploy

ENDPOINT_TYPES = {"chat": "chat/completions/", "completions": "completions/"}

Expand Down
26 changes: 14 additions & 12 deletions scripts/snippets/arc_challenge.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,37 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pip install nvidia-lm-eval==25.6
# pip install nvidia-lm-eval==25.7.1

## Run the evaluation
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EndpointType,
EvaluationConfig,
EvaluationTarget,
)
from nvidia_eval_commons.core.evaluate import evaluate

model_name = "megatron_model"
completions_url = "http://0.0.0.0:8080/v1/completions/"


target_config = EvaluationTarget(
api_endpoint={
"url": completions_url,
"type": "completions",
}
api_endpoint=ApiEndpoint(url=completions_url, type=EndpointType.COMPLETIONS, model_id=model_name)
)
eval_config = EvaluationConfig(
type="arc_challenge",
output_dir="/results/",
params={
"limit_samples": 10,
"extra": {
params=ConfigParams(
limit_samples=10,
extra={
"tokenizer": "/checkpoints/llama-3_2-1b-instruct_v2.0/context/nemo_tokenizer",
"tokenizer_backend": "huggingface",
},
},
),
)


results = evaluate(target_cfg=target_config, eval_cfg=eval_config)


Expand Down
22 changes: 12 additions & 10 deletions scripts/snippets/bfcl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pip install nvidia-bfcl==25.6
# pip install nvidia-bfcl==25.7.1

## Export the required variables
# No environment variables are required
## Run the evaluation
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EndpointType,
EvaluationConfig,
EvaluationTarget,
)
from nvidia_eval_commons.core.evaluate import evaluate

model_name = "megatron_model"
chat_url = "http://0.0.0.0:8080/v1/chat/completions/"


target_config = EvaluationTarget(
api_endpoint={
"url": chat_url,
"type": "chat",
}
target_config = EvaluationTarget(api_endpoint=ApiEndpoint(url=chat_url, type=EndpointType.CHAT, model_id=model_name))
eval_config = EvaluationConfig(
type="bfclv3_ast_prompting", output_dir="/results/", params=ConfigParams(limit_samples=10)
)
eval_config = EvaluationConfig(type="bfclv3_ast", output_dir="/results/", params={"limit_samples": 10})


results = evaluate(target_cfg=target_config, eval_cfg=eval_config)

Expand Down
21 changes: 11 additions & 10 deletions scripts/snippets/bigcode.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,26 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pip install nvidia-bigcode-eval==25.6
# pip install nvidia-bigcode-eval==25.7.1

## Export the required variables
# No environment variables are required
## Run the evaluation
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EndpointType,
EvaluationConfig,
EvaluationTarget,
)
from nvidia_eval_commons.core.evaluate import evaluate

model_name = "megatron_model"
chat_url = "http://0.0.0.0:8080/v1/chat/completions/"


target_config = EvaluationTarget(
api_endpoint={
"url": chat_url,
"type": "chat",
}
)
eval_config = EvaluationConfig(type="mbpp", output_dir="/results/", params={"limit_samples": 10})
target_config = EvaluationTarget(api_endpoint=ApiEndpoint(url=chat_url, type=EndpointType.CHAT, model_id=model_name))
eval_config = EvaluationConfig(type="mbpp", output_dir="/results/", params=ConfigParams(limit_samples=10))


results = evaluate(target_cfg=target_config, eval_cfg=eval_config)
Expand Down
22 changes: 11 additions & 11 deletions scripts/snippets/garak.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pip install nvidia-eval-factory-garak==25.6
# pip install nvidia-eval-factory-garak==25.7.1

## Export the required variables
# No environment variables are required
## Run the evaluation
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EndpointType,
EvaluationConfig,
EvaluationTarget,
)
from nvidia_eval_commons.core.evaluate import evaluate

model_name = "megatron_model"
chat_url = "http://0.0.0.0:8080/v1/chat/completions/"


target_config = EvaluationTarget(
api_endpoint={
"url": chat_url,
"type": "chat",
}
)
target_config = EvaluationTarget(api_endpoint=ApiEndpoint(url=chat_url, type=EndpointType.CHAT, model_id=model_name))
eval_config = EvaluationConfig(
type="garak",
output_dir="/results/",
params={"extra": {"probes": "ansiescape.AnsiEscaped"}},
params=ConfigParams(limit_samples=10, extra={"probes": "ansiescape.AnsiEscaped"}),
)


Expand Down
26 changes: 15 additions & 11 deletions scripts/snippets/lambada.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,32 +12,36 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pip install nvidia-lm-eval==25.6
# pip install nvidia-lm-eval==25.7.1

## Run the evaluation
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EndpointType,
EvaluationConfig,
EvaluationTarget,
)
from nvidia_eval_commons.core.evaluate import evaluate

model_name = "megatron_model"
completions_url = "http://0.0.0.0:8080/v1/completions/"


target_config = EvaluationTarget(
api_endpoint={
"url": completions_url,
"type": "completions",
}
api_endpoint=ApiEndpoint(url=completions_url, type=EndpointType.COMPLETIONS, model_id=model_name)
)

eval_config = EvaluationConfig(
type="lm-evaluation-harness.lambada_openai",
output_dir="/results/",
params={
"limit_samples": 10,
"extra": {
params=ConfigParams(
limit_samples=10,
extra={
"tokenizer": "/checkpoints/llama-3_2-1b-instruct_v2.0/context/nemo_tokenizer",
"tokenizer_backend": "huggingface",
},
},
),
)


Expand Down
27 changes: 14 additions & 13 deletions scripts/snippets/safety.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,38 +12,39 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pip install nvidia-safety-harness==25.6
# pip install nvidia-safety-harness==25.7.1

## Export the required variables
## Key with access to https://build.nvidia.com/ endpoints
# export JUDGE_API_KEY=...
# export HF_TOKEN=...
## Run the evaluation
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EndpointType,
EvaluationConfig,
EvaluationTarget,
)
from nvidia_eval_commons.core.evaluate import evaluate

model_name = "megatron_model"
chat_url = "http://0.0.0.0:8080/v1/chat/completions/"


target_config = EvaluationTarget(
api_endpoint={
"url": chat_url,
"type": "chat",
}
)
target_config = EvaluationTarget(api_endpoint=ApiEndpoint(url=chat_url, type=EndpointType.CHAT, model_id=model_name))
eval_config = EvaluationConfig(
type="aegis_v2",
output_dir="/results/",
params={
"limit_samples": 10,
"extra": {
params=ConfigParams(
limit_samples=10,
extra={
"judge": {
"model_id": "llama-nemotron-safety-guard-v2",
"url": "http://0.0.0.0:9000/v1/completions",
}
},
},
),
)


Expand Down
26 changes: 11 additions & 15 deletions scripts/snippets/simple_evals.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,32 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# pip install nvidia-simple-evals==25.6
# pip install nvidia-simple-evals==25.7.1

## Export the required variables
## Key with access to https://build.nvidia.com/ endpoints
# export JUDGE_API_KEY=...
## Run the evaluation
from nemo_eval.api import evaluate
from nemo_eval.utils.api import EvaluationConfig, EvaluationTarget
from nvidia_eval_commons.api.api_dataclasses import (
ApiEndpoint,
ConfigParams,
EndpointType,
EvaluationConfig,
EvaluationTarget,
)
from nvidia_eval_commons.core.evaluate import evaluate

model_name = "megatron_model"
chat_url = "http://0.0.0.0:8080/v1/chat/completions/"


target_config = EvaluationTarget(
api_endpoint={
"url": chat_url,
"type": "chat",
}
)
eval_config = EvaluationConfig(
type="AIME_2025",
output_dir="/results/",
params={"limit_samples": 10},
)

target_config = EvaluationTarget(api_endpoint=ApiEndpoint(url=chat_url, type=EndpointType.CHAT, model_id=model_name))

eval_config = EvaluationConfig(type="AIME_2025", output_dir="/results/", params=ConfigParams(limit_samples=10))
results = evaluate(target_cfg=target_config, eval_cfg=eval_config)


Expand Down
13 changes: 0 additions & 13 deletions src/nemo_eval/adapters/__init__.py

This file was deleted.

Loading
Loading