Skip to content

Commit d6e5d7b

Browse files
viraatc and claude committed
fix: address PR review comments
- Move ThreadPoolExecutor import to module level (PEP 8)
- Remove unused logger variable
- Use strict=True in zip() calls to catch length mismatches
- Add comment explaining early-stop timing in session loop

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 07972c7 commit d6e5d7b

File tree

2 files changed

+7
-5
lines changed

2 files changed

+7
-5
lines changed

src/inference_endpoint/load_generator/session.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ def _run_test(
9292
data=get_version_info(),
9393
)
9494

95+
# Note: stop_requested is checked after each iteration, so one
96+
# additional sample may be issued after the flag is set. This is
97+
# acceptable — the alternative (checking before next()) would
98+
# require breaking the generator protocol.
9599
for _ in perf_test_generator:
96100
if self.stop_requested:
97101
self.logger.info(

src/inference_endpoint/metrics/reporter.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import sqlite3
2626
from collections import defaultdict
2727
from collections.abc import Callable, Iterable
28+
from concurrent.futures import ThreadPoolExecutor
2829
from enum import Enum
2930
from pathlib import Path
3031
from typing import TYPE_CHECKING, Any
@@ -39,8 +40,6 @@
3940
if TYPE_CHECKING:
4041
from transformers import Tokenizer
4142

42-
logger = logging.getLogger(__name__)
43-
4443

4544
def _parallel_batch_tokenize(tokenizer: Tokenizer, texts: list[str]) -> list[int]:
4645
"""Batch-tokenize texts using all available cores and return token counts.
@@ -49,7 +48,6 @@ def _parallel_batch_tokenize(tokenizer: Tokenizer, texts: list[str]) -> list[int
4948
HuggingFace tokenizers use a Rust backend that releases the GIL,
5049
so threads achieve real parallelism without GIL contention.
5150
"""
52-
from concurrent.futures import ThreadPoolExecutor
5351

5452
n_cores = os.cpu_count() or 1
5553
n_workers = max(1, int(n_cores * 0.95))
@@ -1093,7 +1091,7 @@ def get_output_sequence_lengths(
10931091

10941092
# Parallel batch tokenize across ~95% of cores
10951093
token_counts = _parallel_batch_tokenize(tokenizer, texts)
1096-
rows = list(zip(uuids, token_counts, strict=False))
1094+
rows = list(zip(uuids, token_counts, strict=True))
10971095

10981096
return RollupQueryTable("output_sequence_length", None, rows)
10991097

@@ -1211,7 +1209,7 @@ def derive_TPOT(
12111209
repeats = None
12121210

12131211
for sample_uuid, n_non_first_tokens in zip(
1214-
batch_uuids, token_counts, strict=False
1212+
batch_uuids, token_counts, strict=True
12151213
):
12161214
latency = sample_latency_rollup.filter_uuid(sample_uuid, only_first=True)
12171215
if latency is None:

0 commit comments

Comments (0)