Skip to content

Commit 43a3769

Browse files
committed
Move thread-local storage inside TokenizePool instance. Remove unused .tokenize method - only .token_count is used
1 parent 606a5f4 commit 43a3769

File tree

5 files changed

+139
-125
lines changed

5 files changed

+139
-125
lines changed

src/inference_endpoint/async_utils/services/metrics_aggregator/__main__.py

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import argparse
1919
import asyncio
20+
from contextlib import AbstractContextManager, nullcontext
2021
from pathlib import Path
2122

2223
from inference_endpoint.async_utils.loop_manager import LoopManager
@@ -63,27 +64,31 @@ async def main() -> None:
6364
shutdown_event = asyncio.Event()
6465
loop = LoopManager().default_loop
6566

66-
pool = None
67+
# Using ternary operator causes errors in MyPy object type coalescing
68+
# (coalesces to 'object' not 'AbstractContextManager[TokenizePool | None]')
6769
if args.tokenizer:
68-
pool = TokenizePool(args.tokenizer, n_workers=args.tokenizer_workers)
70+
pool_cm: AbstractContextManager[TokenizePool | None] = TokenizePool(
71+
args.tokenizer, n_workers=args.tokenizer_workers
72+
)
73+
else:
74+
pool_cm = nullcontext()
6975

70-
try:
71-
with ManagedZMQContext.scoped(socket_dir=args.metrics_dir.parent) as zmq_ctx:
72-
emitter = JsonlMetricEmitter(metrics_file, flush_interval=100)
73-
aggregator = MetricsAggregatorService(
74-
args.socket_address,
75-
zmq_ctx,
76-
loop,
77-
topics=None,
78-
emitter=emitter,
79-
tokenize_pool=pool,
80-
shutdown_event=shutdown_event,
81-
)
82-
loop.call_soon_threadsafe(aggregator.start)
83-
await shutdown_event.wait()
84-
finally:
85-
if pool is not None:
86-
pool.close()
76+
with (
77+
pool_cm as pool,
78+
ManagedZMQContext.scoped(socket_dir=args.metrics_dir.parent) as zmq_ctx,
79+
):
80+
emitter = JsonlMetricEmitter(metrics_file, flush_interval=100)
81+
aggregator = MetricsAggregatorService(
82+
args.socket_address,
83+
zmq_ctx,
84+
loop,
85+
topics=None,
86+
emitter=emitter,
87+
tokenize_pool=pool,
88+
shutdown_event=shutdown_event,
89+
)
90+
loop.call_soon_threadsafe(aggregator.start)
91+
await shutdown_event.wait()
8792

8893

8994
if __name__ == "__main__":

src/inference_endpoint/async_utils/services/metrics_aggregator/emitter.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import time
2121
from abc import ABC, abstractmethod
2222
from pathlib import Path
23+
from typing import TextIO
2324

2425
import msgspec
2526

@@ -58,12 +59,14 @@ class JsonlMetricEmitter(MetricEmitter):
5859

5960
def __init__(self, file_path: Path, flush_interval: int = 100) -> None:
6061
self._file_path = file_path.with_suffix(".jsonl")
61-
self._file = self._file_path.open("w")
62+
self._file: TextIO | None = self._file_path.open("w")
6263
self._encoder = msgspec.json.Encoder()
6364
self._flush_interval = flush_interval
6465
self._n_since_flush = 0
6566

6667
def emit(self, sample_uuid: str, metric_name: str, value: int | float) -> None:
68+
if self._file is None:
69+
return
6770
record = _MetricRecord(
6871
sample_uuid=sample_uuid,
6972
metric_name=metric_name,
@@ -89,4 +92,4 @@ def close(self) -> None:
8992
# File may already be closed or I/O error on close (e.g. disk full).
9093
pass
9194
finally:
92-
self._file = None # type: ignore[assignment]
95+
self._file = None

src/inference_endpoint/async_utils/services/metrics_aggregator/token_metrics.py

Lines changed: 35 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -28,29 +28,6 @@
2828
from transformers import PreTrainedTokenizerBase
2929

3030

31-
# Create a thread-local storage for the tokenizer so each thread contains its own instance.
32-
_thread_local = threading.local()
33-
34-
35-
def _get_thread_tokenizer(tokenizer_name: str) -> PreTrainedTokenizerBase:
36-
"""Return the tokenizer for the current thread, loading it if needed."""
37-
if not hasattr(_thread_local, "tokenizer") or _thread_local.tokenizer is None:
38-
_thread_local.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
39-
return _thread_local.tokenizer
40-
41-
42-
def _tokenize_worker(tokenizer_name: str, text: str) -> list[str]:
43-
"""Worker entry: load tokenizer for this thread and tokenize."""
44-
tokenizer = _get_thread_tokenizer(tokenizer_name)
45-
return tokenizer.tokenize(text)
46-
47-
48-
def _token_count_worker(tokenizer_name: str, text: str) -> int:
49-
"""Worker entry: return the number of tokens in text."""
50-
tokenizer = _get_thread_tokenizer(tokenizer_name)
51-
return len(tokenizer.encode(text))
52-
53-
5431
class TokenizePool:
5532
"""A pool of worker threads, each with its own HuggingFace AutoTokenizer.
5633
@@ -64,35 +41,56 @@ class TokenizePool:
6441
- The ThreadPoolExecutor itself is thread-safe (submit/shutdown are synchronized).
6542
- Each worker thread has its own tokenizer via thread-local storage, so there
6643
is no shared mutable state during tokenization.
67-
- The blocking `tokenize()` / `token_count()` methods are safe to call from
68-
multiple threads concurrently.
69-
- In an async context, use the `_async` variants to avoid blocking the event loop.
70-
These use `loop.run_in_executor(None, ...)` to offload to the default executor,
71-
which then submits to the TokenizePool's own ThreadPoolExecutor.
44+
- The blocking `token_count()` method is safe to call from multiple threads
45+
concurrently.
46+
- In an async context, use `token_count_async` to avoid blocking the event loop.
7247
"""
7348

7449
def __init__(self, tokenizer_name: str, n_workers: int) -> None:
7550
if n_workers < 1:
7651
raise ValueError("n_workers must be at least 1")
7752
self._tokenizer_name = tokenizer_name
7853
self._n_workers = n_workers
54+
self._thread_local = threading.local()
7955
self._executor: ThreadPoolExecutor | None = ThreadPoolExecutor(
8056
max_workers=n_workers,
8157
thread_name_prefix="TokenizePool",
8258
)
83-
84-
def tokenize(self, text: str) -> list[str]:
85-
"""Tokenize the input string via the worker pool (blocking)."""
86-
if self._executor is None:
87-
raise RuntimeError("TokenizePool is closed")
88-
future = self._executor.submit(_tokenize_worker, self._tokenizer_name, text)
89-
return future.result()
59+
# Pre-load a tokenizer on every worker thread so the first real
60+
# token_count call doesn't pay the AutoTokenizer.from_pretrained cost.
61+
# Submitting n_workers tasks is guaranteed to hit every thread because
62+
# AutoTokenizer.from_pretrained blocks long enough that no thread
63+
# completes before all tasks are submitted.
64+
# **IMPORTANT**: This is not a guarantee - for instance when using a mock
65+
# object in tests for the tokenizer, the mock object *must* block in the 100ms
66+
# range to simulate proper .from_pretrained behavior.
67+
# It is not super impactful if a thread is not pre-initialized - it will just
68+
# have to pay the cost of .from_pretrained on the first pool.token_count call
69+
# for that thread.
70+
futures = [
71+
self._executor.submit(self._get_thread_tokenizer) for _ in range(n_workers)
72+
]
73+
for f in futures:
74+
f.result()
75+
76+
def _get_thread_tokenizer(self) -> PreTrainedTokenizerBase:
77+
"""Return the tokenizer for the current thread, loading it if needed."""
78+
if getattr(self._thread_local, "tokenizer", None) is None:
79+
self._thread_local.tokenizer = AutoTokenizer.from_pretrained(
80+
self._tokenizer_name
81+
)
82+
return self._thread_local.tokenizer
83+
84+
def _token_count_worker(self, text: str) -> int:
85+
"""Worker entry: return the number of tokens in text."""
86+
tokenizer = self._get_thread_tokenizer()
87+
return len(tokenizer.tokenize(text))
9088

9189
def token_count(self, text: str) -> int:
9290
"""Return the number of tokens in the input string (blocking)."""
9391
if self._executor is None:
9492
raise RuntimeError("TokenizePool is closed")
95-
future = self._executor.submit(_token_count_worker, self._tokenizer_name, text)
93+
future = self._executor.submit(self._token_count_worker, text)
9694
return future.result()
9795

9896
async def token_count_async(
@@ -106,7 +104,7 @@ async def token_count_async(
106104
if self._executor is None:
107105
raise RuntimeError("TokenizePool is closed")
108106
return await loop.run_in_executor(
109-
self._executor, _token_count_worker, self._tokenizer_name, text
107+
self._executor, self._token_count_worker, text
110108
)
111109

112110
def close(self) -> None:

tests/unit/async_utils/services/event_logger/test_event_logger.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ def _make_stub(*args, **kwargs) -> tuple[StubEventLoggerService, list[FakeWriter
117117

118118
@pytest.mark.unit
119119
class TestWriteDispatch:
120-
@pytest.mark.asyncio(mode="strict")
120+
@pytest.mark.asyncio
121121
@pytest.mark.parametrize(
122122
"case_desc, records",
123123
[
@@ -146,14 +146,14 @@ async def test_records_written_to_all_writers(self, case_desc, records):
146146
for writer in writers:
147147
assert writer.written == records
148148

149-
@pytest.mark.asyncio(mode="strict")
149+
@pytest.mark.asyncio
150150
async def test_empty_batch(self):
151151
service, writers = _make_stub()
152152
await service.process([])
153153
for writer in writers:
154154
assert len(writer.written) == 0
155155

156-
@pytest.mark.asyncio(mode="strict")
156+
@pytest.mark.asyncio
157157
async def test_multiple_batches_accumulate(self):
158158
service, writers = _make_stub()
159159
await service.process([_record(SampleEventType.ISSUED, uuid="s1")])
@@ -169,15 +169,15 @@ async def test_multiple_batches_accumulate(self):
169169

170170
@pytest.mark.unit
171171
class TestShutdownBehavior:
172-
@pytest.mark.asyncio(mode="strict")
172+
@pytest.mark.asyncio
173173
async def test_session_ended_triggers_flush_and_close(self):
174174
service, writers = _make_stub()
175175
await service.process([_record(SessionEventType.ENDED, ts=100)])
176176
for writer in writers:
177177
assert writer.flush_count == 1
178178
assert writer.closed
179179

180-
@pytest.mark.asyncio(mode="strict")
180+
@pytest.mark.asyncio
181181
@pytest.mark.parametrize(
182182
"case_desc, trailing_record",
183183
[
@@ -202,13 +202,13 @@ async def test_events_after_ended_same_batch(self, case_desc, trailing_record):
202202
assert len(writer.written) == 1
203203
assert writer.written[0].event_type == SessionEventType.ENDED
204204

205-
@pytest.mark.asyncio(mode="strict")
205+
@pytest.mark.asyncio
206206
async def test_writers_cleared_after_shutdown(self):
207207
service, _ = _make_stub()
208208
await service.process([_record(SessionEventType.ENDED)])
209209
assert service.writers == []
210210

211-
@pytest.mark.asyncio(mode="strict")
211+
@pytest.mark.asyncio
212212
async def test_records_before_ended_are_written(self):
213213
service, writers = _make_stub()
214214
await service.process(
@@ -235,15 +235,15 @@ async def test_records_before_ended_are_written(self):
235235

236236
@pytest.mark.unit
237237
class TestClose:
238-
@pytest.mark.asyncio(mode="strict")
238+
@pytest.mark.asyncio
239239
async def test_close_closes_all_writers(self):
240240
service, writers = _make_stub()
241241
service.close()
242242
for writer in writers:
243243
assert writer.closed
244244
assert service.writers == []
245245

246-
@pytest.mark.asyncio(mode="strict")
246+
@pytest.mark.asyncio
247247
async def test_close_idempotent(self):
248248
service, _ = _make_stub()
249249
service.close()
@@ -257,7 +257,7 @@ async def test_close_idempotent(self):
257257

258258
@pytest.mark.unit
259259
class TestIntegrationWithRealWriters:
260-
@pytest.mark.asyncio(mode="strict")
260+
@pytest.mark.asyncio
261261
async def test_jsonl_writer_integration(self, tmp_path):
262262
"""EventLoggerService with a real JSONLWriter persists records to disk."""
263263
writer = JSONLWriter(tmp_path / "events", flush_interval=1)
@@ -281,7 +281,7 @@ async def test_jsonl_writer_integration(self, tmp_path):
281281
assert records[1].event_type == SampleEventType.RECV_FIRST
282282
assert records[2].event_type == SampleEventType.COMPLETE
283283

284-
@pytest.mark.asyncio(mode="strict")
284+
@pytest.mark.asyncio
285285
async def test_sql_writer_integration(self, tmp_path):
286286
"""EventLoggerService with a real SQLWriter persists records to SQLite."""
287287
from sqlalchemy import create_engine, select
@@ -312,7 +312,7 @@ async def test_sql_writer_integration(self, tmp_path):
312312
]
313313
engine.dispose()
314314

315-
@pytest.mark.asyncio(mode="strict")
315+
@pytest.mark.asyncio
316316
async def test_dual_writer_integration(self, tmp_path):
317317
"""Both JSONL and SQL writers receive the same records."""
318318
jsonl_writer = JSONLWriter(tmp_path / "events", flush_interval=1)
@@ -343,7 +343,7 @@ async def test_dual_writer_integration(self, tmp_path):
343343
assert rows[0].sample_uuid == "dual-1"
344344
engine.dispose()
345345

346-
@pytest.mark.asyncio(mode="strict")
346+
@pytest.mark.asyncio
347347
async def test_ended_closes_real_writers(self, tmp_path):
348348
"""ENDED triggers close on real writers, flushing data to disk."""
349349
jsonl_writer = JSONLWriter(tmp_path / "events", flush_interval=100)
@@ -361,7 +361,7 @@ async def test_ended_closes_real_writers(self, tmp_path):
361361
lines = [line for line in content.split("\n") if line]
362362
assert len(lines) == 2
363363

364-
@pytest.mark.asyncio(mode="strict")
364+
@pytest.mark.asyncio
365365
async def test_events_after_ended_not_persisted_to_jsonl(self, tmp_path):
366366
"""All events after ENDED (including errors) are dropped from JSONL."""
367367
writer = JSONLWriter(tmp_path / "events", flush_interval=100)
@@ -380,7 +380,7 @@ async def test_events_after_ended_not_persisted_to_jsonl(self, tmp_path):
380380
assert len(lines) == 1
381381
assert "LateError" not in lines[0]
382382

383-
@pytest.mark.asyncio(mode="strict")
383+
@pytest.mark.asyncio
384384
async def test_full_lifecycle(self, tmp_path):
385385
"""Full session lifecycle: started -> samples -> ended."""
386386
writer = JSONLWriter(tmp_path / "events", flush_interval=1)
@@ -418,7 +418,7 @@ async def test_full_lifecycle(self, tmp_path):
418418

419419
@pytest.mark.unit
420420
class TestEdgeCases:
421-
@pytest.mark.asyncio(mode="strict")
421+
@pytest.mark.asyncio
422422
@pytest.mark.parametrize(
423423
"case_desc, event_enum, make_record",
424424
[
@@ -439,7 +439,7 @@ async def test_all_event_types_written(self, case_desc, event_enum, make_record)
439439
for writer in writers:
440440
assert len(writer.written) == len(list(event_enum))
441441

442-
@pytest.mark.asyncio(mode="strict")
442+
@pytest.mark.asyncio
443443
async def test_ended_only_triggers_once(self):
444444
"""Multiple ENDED in a batch: shutdown path runs once, second ENDED is dropped."""
445445
service, writers = _make_stub()

0 commit comments

Comments (0)