feat: ollama generate_from_raw uses existing event loop (generative-computing#204)

jakelorocco · tuliocoppola · commit 8374083b3890 · 2025-11-05T15:00:57.000-03:00
* feat: ollama generate_from_raw uses existing event loop

* fix: add blocking prevention mechanism

* fix: test issues with cache
diff --git a/mellea/backends/ollama.py b/mellea/backends/ollama.py
@@ -23,6 +23,7 @@
     get_current_event_loop,
     send_to_queue,
 )
+from mellea.helpers.event_loop_helper import _run_async_in_thread
 from mellea.helpers.fancy_logger import FancyLogger
 from mellea.stdlib.base import (
     CBlock,
@@ -404,28 +405,26 @@ def _generate_from_raw(
         # See https://github.com/ollama/ollama/blob/main/docs/faq.md#how-does-ollama-handle-concurrent-requests.
         prompts = [self.formatter.print(action) for action in actions]
 
-        async def get_response(coroutines):
+        async def get_response():
+            # Run async so that we can make use of Ollama's concurrency.
+            coroutines: list[Coroutine[Any, Any, ollama.GenerateResponse]] = []
+            for prompt in prompts:
+                co = self._async_client.generate(
+                    model=self._get_ollama_model_id(),
+                    prompt=prompt,
+                    raw=True,
+                    think=model_opts.get(ModelOption.THINKING, None),
+                    format=format.model_json_schema() if format is not None else None,
+                    options=self._make_backend_specific_and_remove(model_opts),
+                )
+                coroutines.append(co)
+
             responses = await asyncio.gather(*coroutines, return_exceptions=True)
             return responses
 
-        async_client = ollama.AsyncClient(self._base_url)
-        # Run async so that we can make use of Ollama's concurrency.
-        coroutines = []
-        for prompt in prompts:
-            co = async_client.generate(
-                model=self._get_ollama_model_id(),
-                prompt=prompt,
-                raw=True,
-                think=model_opts.get(ModelOption.THINKING, None),
-                format=format.model_json_schema() if format is not None else None,
-                options=self._make_backend_specific_and_remove(model_opts),
-            )
-            coroutines.append(co)
-
-        # Revisit this once we start using async elsewhere. Only one asyncio event
-        # loop can be running in a given thread.
-        responses: list[ollama.GenerateResponse | BaseException] = asyncio.run(
-            get_response(coroutines)
+        # Run in the same event loop as other Mellea async code that is called from a sync function.
+        responses: list[ollama.GenerateResponse | BaseException] = _run_async_in_thread(
+            get_response()
         )
 
         results = []
diff --git a/mellea/helpers/event_loop_helper.py b/mellea/helpers/event_loop_helper.py
@@ -5,6 +5,8 @@
 from collections.abc import Coroutine
 from typing import Any, TypeVar
 
+from mellea.helpers.async_helpers import get_current_event_loop
+
 R = TypeVar("R")
 
 
@@ -52,6 +54,9 @@ async def finalize_tasks():
 
     def __call__(self, co: Coroutine[Any, Any, R]) -> R:
         """Runs the coroutine in the event loop."""
+        if self._event_loop == get_current_event_loop():
+            # If this gets called from within the same event loop, launch in a separate thread: blocking on .result() below would stall this loop.
+            return _EventLoopHandler()(co)
         return asyncio.run_coroutine_threadsafe(co, self._event_loop).result()
 
 
diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py
@@ -193,8 +193,7 @@ async def get_client_async():
 
     fourth_client = asyncio.run(get_client_async())
     assert fourth_client in backend._client_cache.cache.values()
-    assert second_client not in backend._client_cache.cache.values()
-
+    assert len(backend._client_cache.cache.values()) == 2
 
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py
@@ -206,7 +206,7 @@ async def get_client_async():
 
     fourth_client = asyncio.run(get_client_async())
     assert fourth_client in backend._client_cache.cache.values()
-    assert second_client not in backend._client_cache.cache.values()
+    assert len(backend._client_cache.cache.values()) == 2
 
 if __name__ == "__main__":
     import pytest
diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py
@@ -167,7 +167,7 @@ async def get_client_async():
 
     fourth_client = asyncio.run(get_client_async())
     assert fourth_client in backend._client_cache.cache.values()
-    assert second_client not in backend._client_cache.cache.values()
+    assert len(backend._client_cache.cache.values()) == 2
 
 if __name__ == "__main__":
     import pytest