34 commits
98e9dbf
Add ComputeEval skill.
blahblahasdf Dec 17, 2025
e9ce660
Split out evaluator from generation task.
blahblahasdf Dec 17, 2025
d1f8e8a
Adding clan PR with AudioBench and Librispeech PC. (#1103)
Jorjeous Dec 16, 2025
1510216
Schema overrides for tool-calling (#1118)
gwarmstrong Dec 16, 2025
572f770
FIX tool call error handling and search tool errors (#1120)
gwarmstrong Dec 17, 2025
b2308f7
Use run.Script for generate pipeline (#1052)
gwarmstrong Dec 17, 2025
0832f7d
Port ICPC changes to IOI (#1046)
SeanNaren Dec 17, 2025
eb4969d
replace raise error with LOG.warning in AA LCR dataset prepare (#1119)
anowaczynski-nvidia Dec 17, 2025
0211bea
FIX tavily search results return type (#1123)
gwarmstrong Dec 17, 2025
980a574
Address CodeRabbit review comments.
blahblahasdf Dec 17, 2025
52059dd
Add exception handling during eval.
blahblahasdf Dec 17, 2025
920e174
Remove unnecessary HuggingFace Hub call.
blahblahasdf Dec 19, 2025
1530926
Push `Problem` data transformation logic into the prepare script and…
blahblahasdf Jan 5, 2026
cd88e99
Revert "Use run.Script for generate pipeline (#1052)" (#1125)
gwarmstrong Dec 18, 2025
dc9c920
Fix: add serialized_output on bad request (#1127)
gwarmstrong Dec 18, 2025
9e1a951
update paper link (#1128)
wedu-nvidia Dec 18, 2025
ef71245
update paper link, references to dataset, self-correction differences…
stephencge Dec 18, 2025
a676e34
FIX ioi ignore (#1131)
gwarmstrong Dec 18, 2025
417ef07
download AA-LCR_extracted-text.zip via hf_hub_download (#1126)
anowaczynski-nvidia Dec 18, 2025
a90cc44
Evaluation on Livecodebench-pro (#1115)
wasiahmad Dec 19, 2025
8aba153
Evaluation support for SWE-rebench (#1102)
wasiahmad Dec 24, 2025
c576b91
Trust remote code in tokenizer (#1146)
Kipok Dec 27, 2025
7225f97
Resolve broken links in docs (#1150)
activatedgeek Jan 5, 2026
1cbe0b1
Introduced vLLM_multimodal model to save multimodal outputs (#1136)
vmendelev Jan 6, 2026
4b49e26
add swe-rebench to excluded datasets (#1154)
gwarmstrong Jan 6, 2026
5df7599
Fix run.Script refactor (#1133)
gwarmstrong Jan 6, 2026
7f405cb
BIRD Benchmark (Text-to-SQL) (#1132)
redoctopus Jan 7, 2026
73624d6
o3-mini-20250131 -> o3-mini-2025-01-31 (#1149)
bzantium Jan 7, 2026
94f2208
Update compute-eval version.
blahblahasdf Jan 7, 2026
544199b
Another update to compute eval version that removes the pinned urllib3.
blahblahasdf Jan 8, 2026
b7cb62c
Set compute eval dep to explicit hash.
blahblahasdf Jan 8, 2026
7457060
Merge branch 'main' into add-compute-eval
blahblahasdf Jan 8, 2026
a7aa94a
Address a bunch of the automated PR comments.
blahblahasdf Jan 8, 2026
9a4f608
Add compute eval documentation.
blahblahasdf Jan 8, 2026
59 changes: 59 additions & 0 deletions docs/evaluation/code.md
@@ -178,6 +178,65 @@ all you need to do is replace `openhands` with `swe_agent` in the command above.
!!! note
For evaluation, we use a [custom fork](https://github.com/Kipok/SWE-bench) of the SWE-bench repository that supports running evaluation inside of an existing container. It may not always have the latest updates from the upstream repo.

### compute-eval

- Benchmark is defined in [`nemo_skills/dataset/compute-eval/__init__.py`](https://github.com/NVIDIA-NeMo/Skills/blob/main/nemo_skills/dataset/compute-eval/__init__.py)
- Original benchmark source is [here](https://github.com/NVIDIA/compute-eval).

ComputeEval is a benchmark for evaluating large language models on CUDA code generation tasks. It features handcrafted CUDA programming challenges that test an LLM's ability to write reliable CUDA code. Functional correctness is evaluated by compiling each generated solution and executing it against held-out test suites.

**Prerequisites:** An NVIDIA GPU with CUDA Toolkit 12 or later must be installed, and `nvcc` must be available on your PATH.
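
You can confirm that the compiler is visible with:

```bash
nvcc --version
```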

#### Data Preparation

First, prepare the dataset by running the `ns prepare_data` command. You can optionally specify a release version:

```bash
ns prepare_data compute-eval --release 2025-1
```

If no release is specified, the default release will be downloaded. This will generate an `eval.jsonl` file in the `nemo_skills/dataset/compute-eval/` directory.

**Note:** You need to set the `HF_TOKEN` environment variable because the dataset requires authentication.
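
For example, with a placeholder token value:

```bash
export HF_TOKEN=<your_huggingface_token>
ns prepare_data compute-eval
```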

#### Running the Evaluation

Once the data is prepared, you can run the evaluation. Replace `<...>` placeholders with your cluster and directory paths.

This command runs an evaluation of [OpenReasoning-Nemotron-32B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-32B) on a Slurm cluster:

```bash
ns eval \
--cluster=<CLUSTER_NAME> \
--model=nvidia/OpenReasoning-Nemotron-32B \
--server_type=vllm \
--server_args="--async-scheduling" \
--server_nodes=1 \
--server_gpus=8 \
--benchmarks=compute-eval \
--data_dir=<DATA_DIR> \
--output_dir=<OUTPUT_DIR> \
++inference.temperature=0.6 \
++inference.top_p=0.95 \
++inference.tokens_to_generate=16384
```

**Security Note:** ComputeEval executes machine-generated CUDA code. While the benchmark is designed for evaluation purposes, we strongly recommend running evaluations in a sandboxed environment (e.g., a Docker container or virtual machine) to minimize security risks.

#### Verifying Results

After all jobs complete, check the results in `<OUTPUT_DIR>/eval-results/compute-eval/metrics.json`, or review the summary files under `<OUTPUT_DIR>/eval-results/compute-eval/summarized-results/main_*`. They should look something like this:

```
---------------------------- compute-eval -----------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | accuracy
pass@1          | 50          | 8432       | 1245        | 64.00%
```

The benchmark reports:
- **accuracy**: Percentage of problems where generated code compiled and passed all tests
- **pass@1**: Same as accuracy for single-solution generation
- **pass@k**: Success rate when generating k solutions per problem (if configured; see the sketch below)
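
Measuring pass@k for k > 1 requires multiple generations per problem. A minimal sketch, assuming your nemo-skills version supports the `benchmark:num_repeats` shorthand (check `ns eval --help` for the exact syntax in your version):

```bash
# Generate 8 samples per problem so pass@k (for k <= 8) can be summarized.
ns eval \
    --cluster=<CLUSTER_NAME> \
    --model=nvidia/OpenReasoning-Nemotron-32B \
    --server_type=vllm \
    --server_gpus=8 \
    --benchmarks=compute-eval:8 \
    --data_dir=<DATA_DIR> \
    --output_dir=<OUTPUT_DIR> \
    ++inference.temperature=0.6
```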

### IOI

19 changes: 19 changes: 19 additions & 0 deletions nemo_skills/dataset/compute-eval/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

EVAL_SPLIT = "eval"
DATASET_GROUP = "code"
METRICS_TYPE = "compute-eval"
GENERATION_MODULE = "nemo_skills.inference.eval.compute_eval"
GENERATION_ARGS = "++prompt_config=compute-eval/baseline ++eval_type=compute-eval"
87 changes: 87 additions & 0 deletions nemo_skills/dataset/compute-eval/prepare.py
@@ -0,0 +1,87 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os
from pathlib import Path

from datasets import load_dataset

_CONTEXT_FILES_BLOCK_TEMPLATE = """
--- file: {path}
```{fence}
{content}
```
"""


def _fence_for_path(path: str) -> str:
p = path.lower()
if p.endswith((".cu", ".cuh")):
return "cuda"
if p.endswith((".cc", ".cpp", ".cxx")):
return "cpp"
if p.endswith(".c"):
return "c"
if p.endswith(".h") or p.endswith(".hpp"):
return "h"
# Default to plaintext if unknown
return ""


def _format_context_files_block(context_files: list[dict[str, str]]) -> str:
blocks: list[str] = []
for source in context_files:
if "path" not in source or "content" not in source:
continue

fence = _fence_for_path(source["path"])
blocks.append(
_CONTEXT_FILES_BLOCK_TEMPLATE.format(path=source["path"], fence=fence, content=source["content"])
)
return "".join(blocks)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download and prepare nvidia/compute-eval dataset")
parser.add_argument(
"--release",
type=str,
default=None,
help="Release to download (e.g., '2025-1', '2025-2'). If not specified, downloads default release.",
)

args = parser.parse_args()

token = os.getenv("HF_TOKEN", None)
if not token:
print("Error: HF_TOKEN environment variable not set. Please set it to access the dataset.")
exit(1)

dataset = load_dataset("nvidia/compute-eval", args.release, token=token)
data_dir = Path(__file__).absolute().parent
data_dir.mkdir(exist_ok=True)

with open(data_dir / "eval.jsonl", "wt", encoding="utf-8") as f:
for item in dataset["eval"]:
record = {
"problem": item,
"task_id": item["task_id"],
"problem_prompt": item["prompt"],
"build_command": item["build_command"],
"context_files_block": _format_context_files_block(item["context_files"]),
Review comment (Contributor): no validation that `context_files` list items have the required `path` and `content` keys - will raise `KeyError` if the dataset schema changes.
}

# Dumping using default=str to handle datetime serialization from the problem records
f.write(json.dumps(record, default=str) + "\n")
Review comment (Contributor): using `default=str` in `json.dumps` without a clear reason. `json.dumps(record, default=str)` will convert any non-serializable objects to strings, which could hide data issues. Since all fields should already be JSON-serializable (strings, dicts, lists), consider removing `default=str` or documenting why it is needed.
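
For reference, a minimal sketch (with invented values) of the record layout this script writes to `eval.jsonl`; in practice the `problem` field carries the full raw dataset item:

```python
import json

# Hypothetical record mirroring the keys written by prepare.py above; all values are invented.
context_block = (
    "\n--- file: kernels/add.cu\n"
    "```cuda\n"
    "__global__ void add(float *a) { a[threadIdx.x] += 1.0f; }\n"
    "```\n"
)
record = {
    "problem": {"task_id": "cuda/0", "prompt": "...", "build_command": "..."},  # full raw item in practice
    "task_id": "cuda/0",
    "problem_prompt": "Write a CUDA kernel that increments every element of an array in place.",
    "build_command": "nvcc -o add add.cu",
    "context_files_block": context_block,
}
print(json.dumps(record, default=str))
```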

2 changes: 2 additions & 0 deletions nemo_skills/evaluation/evaluator/__init__.py
@@ -27,6 +27,7 @@
eval_livebench_coding,
eval_livecodebench_pro,
)
from nemo_skills.evaluation.evaluator.compute_eval import ComputeEvalEvaluator
from nemo_skills.evaluation.evaluator.icpc import ICPCEvaluator
from nemo_skills.evaluation.evaluator.ifbench import eval_ifbench
from nemo_skills.evaluation.evaluator.ifeval import eval_if
@@ -69,6 +70,7 @@
"icpc": ICPCEvaluator,
"audio": AudioEvaluator,
"bird": BirdEvaluator,
"compute-eval": ComputeEvalEvaluator,
}

# Validation: Ensure no overlap between class and function maps
85 changes: 85 additions & 0 deletions nemo_skills/evaluation/evaluator/compute_eval.py
@@ -0,0 +1,85 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
from typing import Annotated, Any

from compute_eval.data.data_model import CudaCppProblem, CudaPythonProblem, FileSolution, PatchSolution
from compute_eval.execution import evaluate_solution
from compute_eval.utils.eval_utils import get_nvcc_version, parse_semver
from pydantic import Field, TypeAdapter

from nemo_skills.evaluation.evaluator import BaseEvaluator
from nemo_skills.utils import get_logger_name

_LOG = logging.getLogger(get_logger_name(__file__))
_PROBLEM_ADAPTER = TypeAdapter(Annotated[CudaCppProblem | CudaPythonProblem, Field(discriminator="type")])
_SOLUTION_ADAPTER = TypeAdapter(Annotated[FileSolution | PatchSolution, Field(discriminator="type")])


class ComputeEvalEvaluator(BaseEvaluator):
_installed_ctk_major: int
_installed_ctk_minor: int

def __init__(self, config: dict, num_parallel_requests=10):
super().__init__(config, num_parallel_requests)
nvcc_version = get_nvcc_version()
if not nvcc_version:
raise RuntimeError(
"NVCC not found. Please ensure that the CUDA Toolkit is installed and nvcc is in your PATH."
)

self._installed_ctk_major, self._installed_ctk_minor, _ = parse_semver(nvcc_version)

async def eval_single(self, data_point: dict[str, Any]) -> dict[str, Any]:
# noinspection PyBroadException
try:
Review comment on lines +46 to +47 (Contributor): Using `except Exception` catches all exceptions broadly. While the error is logged and returned in the result, this could mask unexpected issues during development (e.g., `KeyError` from a missing `data_point["problem"]`). Consider catching more specific exceptions, or at least logging the traceback for debugging.

problem = _PROBLEM_ADAPTER.validate_python(data_point["problem"])
solution = _SOLUTION_ADAPTER.validate_python(data_point["solution"])
Review comment on lines +48 to +49 (Contributor): missing error handling for missing keys. If the `data_point["problem"]` or `data_point["solution"]` keys do not exist, this raises a `KeyError` that gets caught by the broad exception handler. Add explicit validation with helpful error messages.


graded = await asyncio.to_thread(
evaluate_solution,
installed_ctk_major=self._installed_ctk_major,
installed_ctk_minor=self._installed_ctk_minor,
problem=problem,
solution=solution,
)

return {
"passed": graded.passed,
"skipped": graded.skipped,
"elapsed_time": graded.elapsed_time,
"build_output": graded.build_output,
"test_output": graded.test_output,
}
except KeyError as e:
_LOG.error(f"Missing required field in data_point: {e}")
return {
"passed": False,
"skipped": False,
"elapsed_time": 0.0,
"build_output": "",
"test_output": "",
"error": f"Missing required field: {e}",
}
except Exception as e:
_LOG.error(f"Error during evaluation: {e}")
return {
"passed": False,
"skipped": False,
"elapsed_time": 0.0,
"build_output": "",
"test_output": "",
"error": str(e),
}
Review comment on lines 76 to 85 (Contributor): catching all exceptions masks critical errors and returns inconsistent fields - a successful evaluation returns `skipped`, `elapsed_time`, `build_output`, and `test_output`, but the error case only returned `passed` and `error`. The suggested change replaces the bare `except Exception` handler (which returned only `passed` and `error`) with the `except KeyError` / `except Exception` handlers that return the full field set, as shown in the code above.

Review comment on lines 76 to 85 (Contributor): overly broad exception handling loses error context. Catching all exceptions with `except Exception` and only logging masks validation errors, type errors, and other important failures. The metrics will treat this as a simple failure (`passed: False`) without distinguishing between:

- invalid problem/solution format (validation error)
- missing CUDA dependencies (runtime error)
- actual test execution failure

Consider catching specific exception types, or at least including the exception type in the `error` field.

12 changes: 12 additions & 0 deletions nemo_skills/evaluation/metrics/code_metrics.py
@@ -121,3 +121,15 @@ def get_incorrect_sample(self, prediction: dict) -> dict:
def update(self, predictions):
super().update(predictions)
self._compute_pass_at_k(predictions=predictions)


class ComputeEvalMetrics(BaseMetrics):
def _get_score_dict(self, prediction: dict) -> dict[str, bool | int | float]:
return {"accuracy": prediction["passed"]}

def get_incorrect_sample(self, prediction: dict) -> dict:
return {"passed": False}

def update(self, predictions):
super().update(predictions)
self._compute_pass_at_k(predictions=predictions)
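
For reference, a minimal illustration (with invented values) of how an evaluator result maps to the score this class reports:

```python
# Hypothetical prediction produced by ComputeEvalEvaluator.eval_single; values are invented.
prediction = {"passed": True, "skipped": False, "elapsed_time": 12.3}

# What ComputeEvalMetrics._get_score_dict returns; pass@k aggregation happens in update().
score = {"accuracy": prediction["passed"]}
assert score == {"accuracy": True}
```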
2 changes: 2 additions & 0 deletions nemo_skills/evaluation/metrics/map_metrics.py
@@ -24,6 +24,7 @@
from nemo_skills.evaluation.metrics.bird_metrics import BirdMetrics
from nemo_skills.evaluation.metrics.code_metrics import (
BigCodeBenchMetrics,
ComputeEvalMetrics,
EvalPlusMetrics,
HumanEvalInfillingMetrics,
LiveCodeBenchMetrics,
@@ -71,6 +72,7 @@
"mmau_pro_closed_form": MMAUProMetrics,
"mmau_pro_open_ended": MMAUProMetrics,
"mmau_pro_instruction_following": MMAUProMetrics,
"compute-eval": ComputeEvalMetrics,
}


73 changes: 73 additions & 0 deletions nemo_skills/inference/eval/compute_eval.py
@@ -0,0 +1,73 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import sys

import hydra
from compute_eval.data.data_model import FileSolution

# noinspection PyProtectedMember
from compute_eval.generate_completions import _parse_solution
Review comments on lines +20 to +21 (several Contributors made the same point): `_parse_solution` is a protected member of `compute_eval.generate_completions` and is not part of the public API; the `# noinspection PyProtectedMember` directive only silences the warning. If compute-eval refactors or removes this internal API, the import will break without notice. Suggested remedies: request that compute-eval expose solution parsing as a public API, or implement the parsing directly in nemo-skills.


from nemo_skills.inference.generate import GenerateSolutionsConfig, GenerationTask
from nemo_skills.inference.model import server_params
from nemo_skills.utils import (
get_help_message,
get_logger_name,
setup_logging,
)

_LOG = logging.getLogger(get_logger_name(__file__))


class ComputeEvalGenerationTask(GenerationTask):
def __init__(self, cfg: GenerateSolutionsConfig):
super().__init__(cfg)

async def process_single_datapoint(self, data_point, data):
res = await super().process_single_datapoint(data_point, data)
try:
solution = FileSolution(
task_id=data_point["task_id"],
files=_parse_solution(res["generation"]),
)
return {
"solution": solution.model_dump(),
"generation": res["generation"],
}
except KeyError as e:
_LOG.error(f"Missing required field: {e}")
raise
except Exception as e:
_LOG.error(f"Failed to parse solution: {e}")
raise


GENERATION_TASK_CLASS = ComputeEvalGenerationTask


@hydra.main(version_base=None, config_name="base_generation_config")
def run_compute_eval(cfg: GenerateSolutionsConfig):
_LOG.info("Config used: %s", cfg)

task = ComputeEvalGenerationTask(cfg)
task.generate()


if __name__ == "__main__":
if "--help" in sys.argv or "-h" in sys.argv:
print(get_help_message(GenerateSolutionsConfig, server_params=server_params()))
else:
setup_logging()
run_compute_eval()