Skip to content

Commit a8c8b2e

Browse files
viraatc and claude
committed
fix: address fourth round of PR review comments
- Fix misleading log messages after early stop (now distinguishes "aborted early" from "all samples issued") - Fix monkeypatch raising=False for cross-platform sched_getaffinity - Fix docstring: 2 CPUs → 4 CPUs to match actual test setup - Add thread-safety note to _parallel_batch_tokenize docstring Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 93627da commit a8c8b2e

File tree

3 files changed

+14
-4
lines changed

3 files changed

+14
-4
lines changed

src/inference_endpoint/load_generator/session.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,10 @@ def _run_test(
106106
EventRecorder.record_event(
107107
SessionEvent.STOP_PERFORMANCE_TRACKING, time.monotonic_ns()
108108
)
109-
self.logger.info("All performance samples issued")
109+
if self.stop_requested:
110+
self.logger.info("Performance sample issuance aborted early")
111+
else:
112+
self.logger.info("All performance samples issued")
110113

111114
if accuracy_test_generators and not self.stop_requested:
112115
for _, generator in accuracy_test_generators.items():
@@ -116,7 +119,10 @@ def _run_test(
116119
if self.stop_requested:
117120
break
118121

119-
self.logger.info("All accuracy samples issued")
122+
if self.stop_requested:
123+
self.logger.info("Accuracy sample issuance aborted early")
124+
else:
125+
self.logger.info("All accuracy samples issued")
120126

121127
self.event_recorder.should_check_idle = True
122128
EventRecorder.record_event(

src/inference_endpoint/metrics/reporter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ def _parallel_batch_tokenize(tokenizer: Tokenizer, texts: list[str]) -> list[int
4747
Uses a ThreadPoolExecutor to parallelize across ~95% of CPU cores.
4848
HuggingFace tokenizers use a Rust backend that releases the GIL,
4949
so threads achieve real parallelism without GIL contention.
50+
A single tokenizer instance is shared across threads — this is safe for
51+
PreTrainedTokenizerFast (Rust-backed, thread-safe by design).
5052
"""
5153

5254
try:

tests/unit/metrics/test_reporter.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,12 +1186,14 @@ def test_empty_data(self):
11861186
def test_parallel_batch_tokenize_threaded_path(tokenizer, monkeypatch):
11871187
"""Exercise the threaded branch of _parallel_batch_tokenize.
11881188
1189-
Monkeypatches os.sched_getaffinity to return 2 CPUs so the threaded path
1189+
Monkeypatches os.sched_getaffinity to return 4 CPUs so the threaded path
11901190
triggers with a modest number of texts, and verifies ordering and counts.
11911191
"""
11921192
# Force 4 CPUs so n_workers=3, then provide 5 texts to exceed the
11931193
# direct-tokenize threshold and exercise the threaded chunking path.
1194-
monkeypatch.setattr(os, "sched_getaffinity", lambda _pid: {0, 1, 2, 3})
1194+
monkeypatch.setattr(
1195+
os, "sched_getaffinity", lambda _pid: {0, 1, 2, 3}, raising=False
1196+
)
11951197
texts = ["hello", "ab", "xyz", "a", "test!"]
11961198
result = _parallel_batch_tokenize(tokenizer, texts)
11971199
# CharacterTokenizer returns len(text) as token count

0 commit comments

Comments
 (0)