feat(interceptors): add reasoning ratio stats (#618)

gchlebus · prokotg · web-flow · commit 53a159e0a86b · 2026-01-13T11:30:51.000+01:00
- Introduced two new statistics, `reasoning_unfinished_count` and
`reasoning_finished_ratio`, to track responses where reasoning started
but did not complete, and the ratio of finished reasoning responses to
all responses with reasoning.
- Updated the logic in `ResponseReasoningInterceptor` to increment this
count appropriately.
- Added unit tests to validate the correct tracking of reasoning states,
ensuring the mathematical invariant between started and finished counts
is maintained.
- Updated documentation to reflect the new statistic and its
significance in evaluating reasoning performance.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
* Added two reasoning metrics: reasoning_unfinished_count (counts
started-but-incomplete reasoning) and reasoning_finished_ratio (fraction
of completed reasoning).

* **Documentation**
* Updated evaluation, interceptor, and tutorial docs to include the new
metrics in examples, metric tables, and artifact descriptions.

* **Tests**
* Added parameterized tests covering finished, unfinished, not-started,
explicit-content, and edge-case reasoning scenarios to validate counts
and ratio.

<sub>✏️ Tip: You can customize this high-level summary in your review
settings.</sub>
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Grzegorz Chlebus <gchlebus@nvidia.com>
Signed-off-by: Tomasz Grzegorzek <tgrzegorzek@nvidia.com>
Co-authored-by: Tomasz Grzegorzek <tgrzegorzek@nvidia.com>
diff --git a/docs/evaluation/run-evals/reasoning.md b/docs/evaluation/run-evals/reasoning.md
@@ -227,7 +227,9 @@ When the reasoning interceptor is enabled, this file contains a `reasoning` key
     "total_responses": 3672,
     "responses_with_reasoning": 2860,
     "reasoning_finished_count": 2860,
+    "reasoning_finished_ratio": 1.0,
     "reasoning_started_count": 2860,
+    "reasoning_unfinished_count": 0,
     "avg_reasoning_words": 153.21,
     "avg_original_content_words": 192.17,
     "avg_updated_content_words": 38.52,
@@ -248,7 +250,7 @@ When the reasoning interceptor is enabled, this file contains a `reasoning` key
 
 In the example above, the model used reasoning for 2860 out of 3672 responses (approximately 78%).
 
-The matching values for `reasoning_started_count` and `reasoning_finished_count` indicate that the `max_new_tokens` parameter was set sufficiently high, allowing the model to complete all reasoning traces without truncation.
+The matching values for `reasoning_started_count` and `reasoning_finished_count` (and `reasoning_unfinished_count` being 0) indicate that the `max_new_tokens` parameter was set sufficiently high, allowing the model to complete all reasoning traces without truncation.
 
 These statistics also enable cost analysis for reasoning operations.
 While the endpoint in this example does not return reasoning token usage statistics (the `*_tokens` fields are null or zero), you can still analyze computational cost using the word count metrics from the responses.
diff --git a/docs/libraries/nemo-evaluator/interceptors/reasoning.md b/docs/libraries/nemo-evaluator/interceptors/reasoning.md
@@ -75,7 +75,9 @@ The interceptor automatically tracks the following statistics:
 | `total_responses` | Total number of responses processed |
 | `responses_with_reasoning` | Number of responses containing reasoning content |
 | `reasoning_finished_count` | Number of responses where reasoning completed (end token found) |
+| `reasoning_finished_ratio` | Ratio (between 0 and 1) of responses where reasoning completed to all responses with reasoning |
 | `reasoning_started_count` | Number of responses where reasoning started |
+| `reasoning_unfinished_count` | Number of responses where reasoning started but did not complete (end token not found) |
 | `avg_reasoning_words` | Average word count in reasoning content |
 | `avg_reasoning_tokens` | Average token count in reasoning content |
 | `avg_original_content_words` | Average word count in original content (before processing) |
diff --git a/docs/tutorials/how-to/reasoning.md b/docs/tutorials/how-to/reasoning.md
@@ -226,6 +226,8 @@ After evaluation completes, check these key artifacts:
 - **`eval_factory_metrics.json`**: Contains reasoning statistics under the `reasoning` key, including:
   - `responses_with_reasoning`: How many responses included reasoning traces
   - `reasoning_finished_count` vs `reasoning_started_count`: If these match, your `max_new_tokens` was sufficient
+  - `reasoning_unfinished_count`: Number of responses where reasoning started but was truncated (didn't reach end token)
+  - `reasoning_finished_ratio`: Ratio (between 0 and 1) of responses where reasoning completed to all responses with reasoning
   - `avg_reasoning_words` and other word- and tokens count metrics: Use these for cost analysis
 
 :::{tip}
diff --git a/packages/nemo-evaluator/src/nemo_evaluator/adapters/interceptors/reasoning_interceptor.py b/packages/nemo-evaluator/src/nemo_evaluator/adapters/interceptors/reasoning_interceptor.py
@@ -127,6 +127,8 @@ def __init__(self, params: Params):
             "responses_with_reasoning": 0,
             "reasoning_finished_count": 0,
             "reasoning_started_count": 0,
+            "reasoning_unfinished_count": 0,
+            "reasoning_finished_ratio": 0,
             "avg_reasoning_words": None,
             "avg_original_content_words": None,
             "avg_updated_content_words": None,
@@ -281,12 +283,18 @@ def _update_reasoning_stats(self, reasoning_info: dict) -> None:
             )
 
             # Increment counters
-            if reasoning_words > 0:
+            if (
+                reasoning_words == "unknown"
+                and reasoning_info.get("reasoning_started") is True
+            ) or (isinstance(reasoning_words, int) and reasoning_words > 0):
+                # if reasoning started but not finished, or finished and we have non-zero reasoning words
                 self._reasoning_stats["responses_with_reasoning"] += 1
-            if reasoning_info.get("reasoning_started"):
+            if reasoning_info.get("reasoning_started") is True:
                 self._reasoning_stats["reasoning_started_count"] += 1
-            if reasoning_info.get("reasoning_finished"):
-                self._reasoning_stats["reasoning_finished_count"] += 1
+                if reasoning_info.get("reasoning_finished"):
+                    self._reasoning_stats["reasoning_finished_count"] += 1
+                else:
+                    self._reasoning_stats["reasoning_unfinished_count"] += 1
 
             # Update running averages
             for stat_key, value in [
@@ -340,6 +348,13 @@ def _update_reasoning_stats(self, reasoning_info: dict) -> None:
                     updated_content_tokens
                 )
 
+            # Update ratio
+            if self._reasoning_stats["responses_with_reasoning"]:
+                self._reasoning_stats["reasoning_finished_ratio"] = (
+                    self._reasoning_stats["reasoning_finished_count"]
+                    / self._reasoning_stats["responses_with_reasoning"]
+                )
+
             # Log aggregated stats at specified interval
             if (
                 self._reasoning_stats["total_responses"]
diff --git a/packages/nemo-evaluator/tests/unit_tests/adapters/interceptors/test_reasoning.py b/packages/nemo-evaluator/tests/unit_tests/adapters/interceptors/test_reasoning.py
@@ -449,6 +449,115 @@ def test_get_reasoning_info_explicit_content(
     assert reasoning_info["reasoning_started"]
 
 
+@pytest.mark.parametrize(
+    "test_name,reasoning_started,reasoning_finished,expected_started_count,expected_finished_count,expected_unfinished_count",
+    [
+        (
+            "reasoning_started_and_finished",
+            True,
+            True,
+            1,  # Started
+            1,  # Finished
+            0,  # Reasoning completed, not unfinished
+        ),
+        (
+            "reasoning_started_not_finished",
+            True,
+            False,
+            1,  # Started
+            0,  # Not finished
+            1,  # Reasoning started but truncated
+        ),
+        (
+            "reasoning_not_started",
+            False,
+            False,
+            0,  # Not started
+            0,  # Not finished
+            0,  # Reasoning never started
+        ),
+        (
+            "reasoning_not_started_but_finished_flag_true",
+            # Edge case: reasoning_content is empty but content is non-empty
+            # This can happen when reasoning_content="" and content="Final answer"
+            # In this case, reasoning_finished=True but reasoning_started=False
+            # We should NOT count this as finished since it never started
+            False,
+            True,
+            0,  # Not started
+            0,  # Should NOT be counted as finished since it never started
+            0,  # Not unfinished either since it never started
+        ),
+        (
+            "reasoning_started_unknown",
+            # Edge case: start_reasoning_token is None and no end token found
+            # In this case, reasoning_started="unknown" (truthy string)
+            # We should NOT count this as started since we don't know
+            "unknown",
+            False,
+            0,  # Unknown should NOT be counted as started
+            0,  # Not finished
+            0,  # Not unfinished since we don't know if it started
+        ),
+    ],
+)
+def test_reasoning_unfinished_count(
+    test_name,
+    reasoning_started,
+    reasoning_finished,
+    expected_started_count,
+    expected_finished_count,
+    expected_unfinished_count,
+):
+    """Test the started/finished/unfinished counters after one stats update.
+
+    Also verifies the mathematical invariant:
+    unfinished_count = started_count - finished_count
+    """
+    interceptor = ResponseReasoningInterceptor(
+        params=ResponseReasoningInterceptor.Params(
+            add_reasoning=True,
+            enable_reasoning_tracking=True,
+            enable_caching=False,
+        )
+    )
+
+    # Simulate reasoning info; a truthy flag (incl. "unknown") gets non-zero word counts
+    reasoning_info = {
+        "reasoning_words": 10 if reasoning_started else 0,
+        "original_content_words": 15 if reasoning_started else 5,
+        "updated_content_words": 5,
+        "reasoning_finished": reasoning_finished,
+        "reasoning_started": reasoning_started,
+        "reasoning_tokens": "unknown",
+        "updated_content_tokens": "unknown",
+    }
+
+    # Update stats with the reasoning info
+    interceptor._update_reasoning_stats(reasoning_info)
+
+    # Verify the counts
+    assert (
+        interceptor._reasoning_stats["reasoning_started_count"]
+        == expected_started_count
+    )
+    assert (
+        interceptor._reasoning_stats["reasoning_finished_count"]
+        == expected_finished_count
+    )
+    assert (
+        interceptor._reasoning_stats["reasoning_unfinished_count"]
+        == expected_unfinished_count
+    )
+
+    # Verify the mathematical invariant: unfinished = started - finished
+    assert (
+        interceptor._reasoning_stats["reasoning_unfinished_count"]
+        == interceptor._reasoning_stats["reasoning_started_count"]
+        - interceptor._reasoning_stats["reasoning_finished_count"]
+    )
+
+
 @pytest.mark.parametrize(
     "test_name,message_content,expected_reasoning_words,expected_original_content_words,expected_reasoning_finished",
     [
@@ -499,6 +608,54 @@ def test_get_reasoning_info_embedded_content(
     assert reasoning_info["reasoning_started"] == "unknown"
 
 
+def test_reasoning_ratio():
+    """Test reasoning_finished_ratio over finished, unfinished, and no-reasoning responses."""
+    interceptor = ResponseReasoningInterceptor(
+        params=ResponseReasoningInterceptor.Params(
+            add_reasoning=True,
+            enable_reasoning_tracking=True,
+            end_reasoning_token="</think>",
+            start_reasoning_token="<think>",
+            enable_caching=False,
+        )
+    )
+    # Build a mix of finished, unfinished, and no-reasoning assistant messages
+    messages = []
+    n_finished_reasoning = 7
+    n_unfinished_reasoning = 3
+    n_no_reasoning = 20
+
+    messages.extend(
+        [
+            {
+                "role": "assistant",
+                "content": "<think> thinking trace </think> rest of the message",
+            }
+            for _ in range(n_finished_reasoning)
+        ]
+    )
+    messages.extend(
+        [
+            {"role": "assistant", "content": "<think> thinking trace unfinished"}
+            for _ in range(n_unfinished_reasoning)
+        ]
+    )
+    messages.extend(
+        [
+            {"role": "assistant", "content": "no thinking trace"}
+            for _ in range(n_no_reasoning)
+        ]
+    )
+
+    reasoning_info = None
+
+    # Process every message and accumulate stats; expected ratio is 7 finished / 10 with reasoning
+    for message in messages:
+        _, reasoning_info = interceptor._process_reasoning_message(message)
+        interceptor._update_reasoning_stats(reasoning_info)
+    assert interceptor._reasoning_stats["reasoning_finished_ratio"] == 0.7
+
+
 @pytest.mark.parametrize(
     "test_name,include_if_not_finished,message_content,expected_content,expected_reasoning_words,expected_original_content_words",
     [