Skip to content

Commit 92f188a

Browse files
Improve code completion performance (ThreadedCompleter)
Fixes in `generator_to_async_generator` (back-pressure and performance improvements) and in `ThreadedCompleter`, where it is used.
1 parent fcbc218 commit 92f188a

File tree

3 files changed

+152
-27
lines changed

3 files changed

+152
-27
lines changed

src/prompt_toolkit/completion/base.py

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
"""
22
"""
33
from abc import ABCMeta, abstractmethod
4-
from typing import AsyncGenerator, Callable, Iterable, Optional, Sequence
4+
from typing import AsyncGenerator, Callable, Iterable, List, Optional, Sequence
55

66
from prompt_toolkit.document import Document
7-
from prompt_toolkit.eventloop import generator_to_async_generator
7+
from prompt_toolkit.eventloop import (
8+
aclosing,
9+
generator_to_async_generator,
10+
get_event_loop,
11+
)
812
from prompt_toolkit.filters import FilterOrBool, to_filter
913
from prompt_toolkit.formatted_text import AnyFormattedText, StyleAndTextTuples
1014

@@ -224,10 +228,61 @@ async def get_completions_async(
224228
"""
225229
Asynchronous generator of completions.
226230
"""
227-
async for completion in generator_to_async_generator(
228-
lambda: self.completer.get_completions(document, complete_event)
229-
):
230-
yield completion
231+
# NOTE: Right now, we are consuming the `get_completions` generator in
232+
# a synchronous background thread, then passing the results one
233+
# at a time over a queue, and consuming this queue in the main
234+
# thread (that's what `generator_to_async_generator` does). That
235+
# means that if the completer is *very* slow, we'll be showing
236+
# completions in the UI once they are computed.
237+
238+
# It's very tempting to replace this implementation with the
239+
# commented code below for several reasons:
240+
241+
# - `generator_to_async_generator` is not perfect and hard to get
242+
# right. It's a lot of complexity for little gain. The
243+
# implementation needs a huge buffer for it to be efficient
244+
# when there are many completions (like 50k+).
245+
# - Normally, a completer is supposed to be fast, users can have
246+
# "complete while typing" enabled, and want to see the
247+
# completions within a second. Handling one completion at a
248+
# time, and rendering once we get it here doesn't make any
249+
# sense if this is quick anyway.
250+
# - Completers like `FuzzyCompleter` prepare all completions
251+
# anyway so that they can be sorted by accuracy before they are
252+
# yielded. At the point that we start yielding completions
253+
# here, we already have all completions.
254+
# - The `Buffer` class has complex logic to invalidate the UI
255+
# while it is consuming the completions. We don't want to
256+
# invalidate the UI for every completion (if there are many),
257+
# but we want to do it often enough so that completions are
258+
# being displayed while they are produced.
259+
260+
# We keep the current behavior mainly for backward-compatibility.
261+
# Similarly, it would be better for this function to not return
262+
# an async generator, but simply be a coroutine that returns a
263+
# list of `Completion` objects, containing all completions at
264+
# once.
265+
266+
# Note that this argument doesn't mean we shouldn't use
267+
# `ThreadedCompleter`. It still makes sense to produce
268+
# completions in a background thread, because we don't want to
269+
# freeze the UI while the user is typing. But sending the
270+
# completions one at a time to the UI maybe isn't worth it.
271+
272+
# def get_all_in_thread() -> List[Completion]:
273+
# return list(self.get_completions(document, complete_event))
274+
275+
# completions = await get_event_loop().run_in_executor(None, get_all_in_thread)
276+
# for completion in completions:
277+
# yield completion
278+
279+
async with aclosing(
280+
generator_to_async_generator(
281+
lambda: self.completer.get_completions(document, complete_event)
282+
)
283+
) as async_generator:
284+
async for completion in async_generator:
285+
yield completion
231286

232287
def __repr__(self) -> str:
233288
return f"ThreadedCompleter({self.completer!r})"
@@ -306,10 +361,11 @@ async def get_completions_async(
306361

307362
# Get all completions in a non-blocking way.
308363
if self.filter():
309-
async for item in self.completer.get_completions_async(
310-
document, complete_event
311-
):
312-
yield item
364+
async with aclosing(
365+
self.completer.get_completions_async(document, complete_event)
366+
) as async_generator:
367+
async for item in async_generator:
368+
yield item
313369

314370

315371
class _MergedCompleter(Completer):
@@ -333,8 +389,11 @@ async def get_completions_async(
333389

334390
# Get all completions from the other completers in a non-blocking way.
335391
for completer in self.completers:
336-
async for item in completer.get_completions_async(document, complete_event):
337-
yield item
392+
async with aclosing(
393+
completer.get_completions_async(document, complete_event)
394+
) as async_generator:
395+
async for item in async_generator:
396+
yield item
338397

339398

340399
def merge_completers(

src/prompt_toolkit/eventloop/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from .async_generator import generator_to_async_generator
1+
from .async_generator import aclosing, generator_to_async_generator
22
from .inputhook import (
33
InputHookContext,
44
InputHookSelector,
@@ -15,6 +15,7 @@
1515
__all__ = [
1616
# Async generator
1717
"generator_to_async_generator",
18+
"aclosing",
1819
# Utils.
1920
"run_in_executor_with_context",
2021
"call_soon_threadsafe",
Lines changed: 79 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,62 @@
11
"""
22
Implementation for async generators.
33
"""
4-
from asyncio import Queue
5-
from typing import AsyncGenerator, Callable, Iterable, TypeVar, Union
4+
from contextlib import asynccontextmanager
5+
from queue import Empty, Full, Queue
6+
from threading import Event
7+
from typing import (
8+
TYPE_CHECKING,
9+
AsyncGenerator,
10+
Awaitable,
11+
Callable,
12+
Iterable,
13+
TypeVar,
14+
Union,
15+
)
616

717
from .utils import get_event_loop, run_in_executor_with_context
818

919
__all__ = [
20+
"aclosing",
1021
"generator_to_async_generator",
1122
]
1223

1324

25+
if TYPE_CHECKING:
26+
# Thanks: https://github.com/python/typeshed/blob/main/stdlib/contextlib.pyi
27+
from typing_extensions import Protocol
28+
29+
class _SupportsAclose(Protocol):
30+
def aclose(self) -> Awaitable[object]:
31+
...
32+
33+
_SupportsAcloseT = TypeVar("_SupportsAcloseT", bound=_SupportsAclose)
34+
35+
36+
@asynccontextmanager
37+
async def aclosing(
38+
thing: "_SupportsAcloseT",
39+
) -> AsyncGenerator["_SupportsAcloseT", None]:
40+
"Similar to `contextlib.aclosing`, in Python 3.10."
41+
try:
42+
yield thing
43+
finally:
44+
await thing.aclose()
45+
46+
47+
# By default, choose a buffer size that's a good balance between having enough
48+
# throughput, but not consuming too much memory. We use this to consume a sync
49+
# generator of completions as an async generator. If the queue size is very
50+
# small (like 1), consuming the completions goes really slow (when there are a
51+
# lot of items). If the queue size would be unlimited or too big, this can
52+
# cause overconsumption of memory, and cause CPU time spent producing items
53+
# that are no longer needed (if the consumption of the async generator stops at
54+
# some point). We need a fixed size in order to get some back pressure from the
55+
# async consumer to the sync producer. We choose 1000 by default here. If we
56+
# have around 50k completions, measurements show that 1000 is still
57+
# significantly faster than a buffer of 100.
58+
DEFAULT_BUFFER_SIZE: int = 1000
59+
1460
_T = TypeVar("_T")
1561

1662

@@ -19,7 +65,8 @@ class _Done:
1965

2066

2167
async def generator_to_async_generator(
22-
get_iterable: Callable[[], Iterable[_T]]
68+
get_iterable: Callable[[], Iterable[_T]],
69+
buffer_size: int = DEFAULT_BUFFER_SIZE,
2370
) -> AsyncGenerator[_T, None]:
2471
"""
2572
Turn a generator or iterable into an async generator.
@@ -28,10 +75,12 @@ async def generator_to_async_generator(
2875
2976
:param get_iterable: Function that returns a generator or iterable when
3077
called.
78+
:param buffer_size: Size of the queue between the async consumer and the
79+
synchronous generator that produces items.
3180
"""
3281
quitting = False
33-
_done = _Done()
34-
q: Queue[Union[_T, _Done]] = Queue()
82+
# NOTE: We are limiting the queue size in order to have back-pressure.
83+
q: Queue[Union[_T, _Done]] = Queue(maxsize=buffer_size)
3584
loop = get_event_loop()
3685

3786
def runner() -> None:
@@ -44,19 +93,38 @@ def runner() -> None:
4493
# When this async generator was cancelled (closed), stop this
4594
# thread.
4695
if quitting:
47-
break
48-
49-
loop.call_soon_threadsafe(q.put_nowait, item)
96+
return
97+
98+
while True:
99+
try:
100+
q.put(item, timeout=1)
101+
except Full:
102+
if quitting:
103+
return
104+
continue
105+
else:
106+
break
50107

51108
finally:
52-
loop.call_soon_threadsafe(q.put_nowait, _done)
109+
while True:
110+
try:
111+
q.put(_Done(), timeout=1)
112+
except Full:
113+
if quitting:
114+
return
115+
continue
116+
else:
117+
break
53118

54119
# Start background thread.
55120
runner_f = run_in_executor_with_context(runner)
56121

57122
try:
58123
while True:
59-
item = await q.get()
124+
try:
125+
item = q.get_nowait()
126+
except Empty:
127+
item = await loop.run_in_executor(None, q.get)
60128
if isinstance(item, _Done):
61129
break
62130
else:
@@ -67,8 +135,5 @@ def runner() -> None:
67135
quitting = True
68136

69137
# Wait for the background thread to finish. (should happen right after
70-
# the next item is yielded). If we don't do this, and the event loop
71-
# gets closed before the runner is done, then we'll get a
72-
# `RuntimeError: Event loop is closed` exception printed to stdout that
73-
# we can't handle.
138+
# the last item is yielded).
74139
await runner_f

0 commit comments

Comments
 (0)