14 changes: 14 additions & 0 deletions nemo_skills/inference/generate.py
@@ -46,6 +46,7 @@
server_params,
)
from nemo_skills.inference.model.base import EndpointType
from nemo_skills.inference.structured_outputs import STRUCTURED_OUTPUTS
from nemo_skills.prompt.utils import get_prompt, get_token_count
from nemo_skills.utils import (
chunk_data,
@@ -218,6 +219,8 @@ class GenerationTaskConfig:
eval_type: str | None = None # "lean4-proof", "math", etc.
eval_config: dict = field(default_factory=dict) # Config for the evaluator

structured_output: str | None = None

def __post_init__(self):
self._post_init_validate_data()
self._post_init_validate_server()
@@ -630,6 +633,14 @@ async def postprocess_single_output(self, output, original_data_point):
# all of the original data to the output file alongside the new generations
output[self.cfg.generation_key] = output.pop("generation")

if self.cfg.structured_output == "HLE_JUDGE_AA":
Collaborator:

It is not a good idea to hard-code HLE_JUDGE_AA in generate.py. Can we build a function to handle it instead, similar to the existing if self.cfg.parse_reasoning: check?

Collaborator:

@anowaczynski-nvidia, can we move this logic into metrics? Why does it need to be part of generation?

Collaborator Author (@anowaczynski-nvidia), Jan 28, 2026:

Reasons I added the postprocessing branch here:

  • to enable the AA-compatible HLE judge, ++structured_output=HLE_JUDGE_AA needs to be added in only one place (the judge generations pipeline command)
  • with the current version, the summarize_results command and the pipeline logic for aggregating HLE judge outputs into metrics don't require any modifications (the same command and code handle both the default and the AA-compatible judge)

I am aware this code is fundamental to the entire package; all generations pass through it.

Regarding moving this to metrics: I could create hleaa_metrics.py in evaluation/metrics, inherit from MathMetrics, and override only _get_score_dict, so that postprocessing of the judgement (parsing the JSON, etc.) is applied before is_correct_judgement. Do you approve this plan?
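
A minimal sketch of that plan (the file location comes from this thread; the exact MathMetrics interface, the "judgement" key name, and the import path are assumptions, not the real code):

# nemo_skills/evaluation/metrics/hleaa_metrics.py -- hypothetical sketch
import json

from nemo_skills.evaluation.metrics.math_metrics import MathMetrics  # assumed import path


class HLEAAMetrics(MathMetrics):
    """Parse the AA-compatible structured judgement before the usual scoring."""

    def _get_score_dict(self, prediction):
        # Assumed field name for the judge output; adjust to the real generation key.
        raw = prediction.get("judgement", "")
        try:
            parsed = json.loads(raw)
            prediction = {**prediction, "judgement": "Judgement: {}".format(parsed["correct"])}
        except (json.JSONDecodeError, KeyError):
            prediction = {**prediction, "judgement": "Judgement: FAILED_TO_PARSE"}
        # is_correct_judgement (used downstream by MathMetrics) now sees the plain-text form.
        return super()._get_score_dict(prediction)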

Collaborator:

Yes, either that, or we can just have this as an option for the main math metrics, so that any dataset, not just HLE, can be evaluated in this setup. The one problem is that I am not fully sure metrics are currently customizable, but if they are not, we should enable customization in a way similar to how it's done for eval / generation parameters. Let me know if you need help with the design on that; happy to discuss in more detail.

Collaborator Author:

@Kipok I tried the harder way first, but nothing I came up with was correct and convincing, so I pushed one commit with a class HLEAAMetrics(MathMetrics) solution, as it is conceptually much simpler. The main downside is that I had to add metric_type to the eval command, which doesn't look right and doesn't compose with the idea of evaluating multiple benchmarks at once. Can you take a look? If we go with the Metrics Config idea, I need a sync on how to approach it.

Collaborator:

I think this is the right approach. When running eval on multiple benchmarks you can't really customize anything except maybe inference parameters; e.g. changing the prompt or the eval arguments would also break things, so adding metric_type is a good change. An alternative would be to add this as an argument to MathMetrics, and then reuse the existing metric_kwargs parameter to customize it. But adding metric_type is a good change anyway, given that we already support metric_kwargs.

If the current implementation fully works for you, I think it LGTM as well and we can merge it. But do let me know if you have any concerns or think we should do things differently.

Collaborator:

It's probably a good idea to add a new test for this in test_generation.py, but only if models on build.nvidia.com support this response_format argument.

Collaborator Author:

Added test_judge_generations_with_structured_output, but it takes 10 minutes to complete even with max_samples=2. Obviously this can't be merged as is, so where do we go from here?

Collaborator:

Thanks @anowaczynski-nvidia, I pushed a change to limit max tokens (since we aren't checking generation correctness anyway); it seems to finish very fast now!

    try:
        output[self.cfg.generation_key] = "Judgement: {}".format(
            json.loads(output[self.cfg.generation_key])["correct"]
        )
    except (json.JSONDecodeError, KeyError):
        output[self.cfg.generation_key] = "Judgement: FAILED_TO_PARSE"
Contributor:

The hardcoded check for "HLE_JUDGE_AA" is inconsistent with line 695, which checks membership in STRUCTURED_OUTPUTS. If new structured output formats are added to STRUCTURED_OUTPUTS, they will set response_format but won't have corresponding postprocessing logic. Consider checking self.cfg.structured_output in STRUCTURED_OUTPUTS here as well, or creating a registry of postprocessing handlers (see the sketch below).
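
A rough sketch of the registry variant (module placement and names here are illustrative, not existing code):

# e.g. in structured_outputs.py -- hypothetical postprocessing registry
import json


def _postprocess_hle_judge_aa(generation: str) -> str:
    try:
        return "Judgement: {}".format(json.loads(generation)["correct"])
    except (json.JSONDecodeError, KeyError):
        return "Judgement: FAILED_TO_PARSE"


STRUCTURED_OUTPUT_POSTPROCESSORS = {
    "HLE_JUDGE_AA": _postprocess_hle_judge_aa,
}

# generate.py would then only need:
#     postprocess = STRUCTURED_OUTPUT_POSTPROCESSORS.get(self.cfg.structured_output)
#     if postprocess is not None:
#         output[self.cfg.generation_key] = postprocess(output[self.cfg.generation_key])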


if not self.cfg.add_generation_stats:
    output.pop("generation_start_time", None)
    output.pop("generation_end_time", None)
@@ -681,6 +692,9 @@ async def process_single_datapoint(self, data_point, all_data):
"stop_phrases": [self.cfg.stop_phrase] if self.cfg.stop_phrase else None,
}

if self.cfg.structured_output in STRUCTURED_OUTPUTS:
    generation_params["response_format"] = STRUCTURED_OUTPUTS[self.cfg.structured_output]
Contributor:

⚠️ Potential issue | 🟡 Minor

Consider validating structured_output against registry early.

If a user specifies a structured_output value that's not in STRUCTURED_OUTPUTS, the code silently ignores it without injecting response_format. This could lead to unexpected behavior. Per coding guidelines, the code should fail if a user specifies an unsupported argument.

Proposed fix in `__post_init__` or `process_single_datapoint`

Add validation in GenerationTaskConfig.__post_init__:

def _post_init_validate_params(self):
    # ... existing validations ...
    if self.structured_output is not None and self.structured_output not in STRUCTURED_OUTPUTS:
        raise ValueError(
            f"Unknown structured_output '{self.structured_output}'. "
            f"Valid options: {list(STRUCTURED_OUTPUTS.keys())}"
        )
🤖 Prompt for AI Agents
In `@nemo_skills/inference/generate.py` around lines 695-696: the code silently ignores unknown structured_output values. Add a validation in GenerationTaskConfig.__post_init__ (or call a helper _post_init_validate_params from __post_init__) that checks whether self.structured_output is not None and not in STRUCTURED_OUTPUTS, and raise a ValueError listing the invalid value and the valid keys (referencing STRUCTURED_OUTPUTS and the attribute structured_output). This ensures the process_single_datapoint / generation_params population logic (where generation_params["response_format"] is set) never silently drops an unsupported structured_output.


Comment on lines +694 to +696
Contributor:

Unhandled invalid key

When structured_output is set to any non-None value that is not present in STRUCTURED_OUTPUTS, process_single_datapoint will throw a KeyError at STRUCTURED_OUTPUTS[self.cfg.structured_output]. Since this is a user-provided config value (Hydra/CLI via ++structured_output=...), this becomes an unhelpful crash path. Consider validating structured_output in GenerationTaskConfig.__post_init__ (or using .get() with an explicit ValueError listing allowed keys) so users get a clear error message.
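
For illustration, the .get() variant could look roughly like this (a sketch, not a drop-in patch):

response_format = STRUCTURED_OUTPUTS.get(self.cfg.structured_output)
if self.cfg.structured_output is not None and response_format is None:
    raise ValueError(
        f"Unknown structured_output '{self.cfg.structured_output}'. "
        f"Allowed values: {list(STRUCTURED_OUTPUTS)}"
    )
if response_format is not None:
    generation_params["response_format"] = response_format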

if self.cfg.code_execution:
    if self.cfg.override_max_code_executions and self.cfg.total_code_executions_in_prompt is not None:
        generation_params["max_code_executions"] = data_point["total_code_executions"]
2 changes: 2 additions & 0 deletions nemo_skills/inference/model/base.py
@@ -236,6 +236,7 @@ async def generate_async(
tools: list[dict] | None = None,
include_response: bool = False,
extra_body: dict = None,
response_format=None,
) -> dict:
if endpoint_type is None:
# Infering completion type from prompt
@@ -261,6 +262,7 @@
"reasoning_effort": reasoning_effort,
"tools": tools,
"extra_body": extra_body,
"response_format": response_format,
}

# TODO: remove this after we no longer use gpt-oss or it's fixed in vllm
3 changes: 3 additions & 0 deletions nemo_skills/inference/model/gemini.py
@@ -57,6 +57,7 @@ def _build_chat_request_params(
reasoning_effort: str | None,
extra_body: dict = None,
tools: list[dict] | None = None,
response_format=None,
) -> dict:
"""
https://github.com/BerriAI/litellm/blob/v1.75.0-nightly/litellm/constants.py#L45-L56
@@ -72,6 +73,8 @@
"`repetition_penalty` is not supported by Gemini API, please set it to default value `1.0`."
)
assert not extra_body, "`extra_body` is not supported by Gemini API, please set it to None or empty dict"
if response_format is not None:
    raise NotImplementedError()

# Vertext AI params: https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference
# litellm default params: https://github.com/BerriAI/litellm/blob/v1.75.0-nightly/litellm/llms/gemini/chat/transformation.py#L73-L90
4 changes: 4 additions & 0 deletions nemo_skills/inference/model/megatron.py
@@ -36,6 +36,7 @@ def _build_chat_request_params(
stop_phrases: list[str] | None = None,
timeout: int | None = None,
top_logprobs: int | None = None,
response_format=None,
**kwargs,
) -> dict:
# Validations
@@ -48,6 +49,7 @@
if top_k != -1:
    raise NotImplementedError("Megatron server does not support top_k parameter.")
assert kwargs.get("tools") is None, "Megatron server does not support tools parameter."
assert response_format is None, "Megatron server does not support response_format parameter."

params = {
"messages": messages,
@@ -81,6 +83,7 @@ def _build_completion_request_params(
stop_phrases: list[str] | None = None,
timeout: int | None = None,
top_logprobs: int | None = None,
response_format=None,
**kwargs,
) -> dict:
# Parameter validation specific to Megatron
@@ -93,6 +96,7 @@
if top_k != -1:
    raise NotImplementedError("Megatron server does not support top_k parameter.")
assert kwargs.get("tools") is None, "Megatron server does not support tools parameter."
assert response_format is None, "Megatron server does not support response_format parameter."

return {
"prompt": prompt,
5 changes: 5 additions & 0 deletions nemo_skills/inference/model/openai.py
@@ -69,6 +69,9 @@ def _build_completion_request_params(self, **kwargs) -> dict:
assert kwargs.pop("reasoning_effort", None) is None, (
"reasoning_effort is not supported by completion requests."
)
assert kwargs.pop("response_format", None) is None, (
"response_format is not supported by completion requests."
)
assert kwargs.pop("top_k", -1) == -1, "`top_k` is not supported by OpenAI API, please set it to -1."
assert kwargs.pop("min_p", 0.0) == 0.0, "`min_p` is not supported by OpenAI API, please set it to 0.0."
assert kwargs.pop("repetition_penalty", 1.0) == 1.0, (
@@ -100,6 +103,7 @@ def _build_chat_request_params(
reasoning_effort: str | None,
extra_body: dict = None,
tools: list[dict] | None = None,
response_format=None,
) -> dict:
# Validations
if top_k != -1:
@@ -116,6 +120,7 @@
"timeout": timeout,
"stream": stream,
"tools": tools,
"response_format": response_format,
}

if self._is_reasoning_model(self.model):
2 changes: 2 additions & 0 deletions nemo_skills/inference/model/sglang.py
@@ -39,6 +39,7 @@ def _build_chat_request_params(
reasoning_effort: str | None = None,
tools: list[dict] | None = None,
extra_body: dict = None,
response_format=None,
) -> dict:
request = super()._build_chat_request_params(
messages=messages,
@@ -56,6 +57,7 @@
reasoning_effort=reasoning_effort,
tools=tools,
extra_body=extra_body,
response_format=response_format,
)
# SGLang requires tool_choice in the request body when tools are provided
if tools is not None:
4 changes: 4 additions & 0 deletions nemo_skills/inference/model/vllm.py
@@ -143,9 +143,11 @@ def _build_completion_request_params(
reasoning_effort: str | None = None,
extra_body: dict = None,
tools: list[dict] | None = None,
response_format=None,
) -> dict:
assert reasoning_effort is None, "reasoning_effort is not supported for text completion requests"
assert tools is None, "tools are not supported for text completion requests"
assert response_format is None, "response_format is not supported for text completion requests"
return {
"prompt": prompt,
"max_tokens": tokens_to_generate,
@@ -182,6 +184,7 @@ def _build_chat_request_params(
reasoning_effort: str | None = None,
tools: list[dict] | None = None,
extra_body: dict = None,
response_format=None,
) -> dict:
# Process messages to handle image content (VLM support)
processed_messages = []
@@ -207,6 +210,7 @@
"timeout": timeout,
"extra_body": self._build_request_body(top_k, min_p, repetition_penalty, extra_body=extra_body),
"tools": tools,
"response_format": response_format,
}
if reasoning_effort:
    request["allowed_openai_params"] = ["reasoning_effort"]
29 changes: 29 additions & 0 deletions nemo_skills/inference/structured_outputs.py
@@ -0,0 +1,29 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Literal

from pydantic import BaseModel


class HLEJudgeAAResponseFormat(BaseModel):
extracted_final_answer: str
reasoning: str
correct: Literal["yes", "no"]
confidence: int
Contributor:

The confidence field has no validation constraints. It should be confidence: int = Field(ge=0, le=100) or similar to ensure valid confidence values (note this also requires importing Field from pydantic).

Suggested change
confidence: int
confidence: int = Field(ge=0, le=100, description="Confidence score from 0 to 100")

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!



STRUCTURED_OUTPUTS = {
"HLE_JUDGE_AA": HLEJudgeAAResponseFormat,
}
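
A quick illustration of how this schema validates a judge response (assuming pydantic v2; not part of the diff):

raw = '{"extracted_final_answer": "42", "reasoning": "...", "correct": "yes", "confidence": 87}'
parsed = HLEJudgeAAResponseFormat.model_validate_json(raw)
assert parsed.correct == "yes" and parsed.confidence == 87

# A value outside Literal["yes", "no"] raises pydantic.ValidationError:
# HLEJudgeAAResponseFormat.model_validate_json(raw.replace('"yes"', '"maybe"'))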