
Commit bb4740e

Merge branch 'main' into jal/top-level-async
2 parents 2e5be4f + bd9fb5f commit bb4740e

File tree

10 files changed: +112, -7 lines


.github/mergify.yml

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@ merge_protections:
       - base = main
     success_conditions:
       - "title ~=
-        ^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\
+        ^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert|release)(?:\\(.+\
         \\))?:"
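For reference, the tightened title check (which now also accepts `release` as a type) can be exercised locally. A minimal sketch in Python: the pattern is copied from the mergify condition above, while the example titles are invented for illustration.

```python
import re

# Conventional-commit-style title check mirroring the updated mergify rule.
TITLE_RE = re.compile(
    r"^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert|release)"
    r"(?:\(.+\))?:"
)

assert TITLE_RE.match("release: v0.1.0")                    # newly allowed type
assert TITLE_RE.match("feat(backends): add async support")  # scoped type still works
assert not TITLE_RE.match("Release v0.1.0")                 # wrong case, missing colon
```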

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -1,3 +1,12 @@
+## [v0.1.0](https://github.com/generative-computing/mellea/releases/tag/v0.1.0) - 2025-10-01
+
+### Feature
+
+* Add fix to watsonx and note to litellm ([#173](https://github.com/generative-computing/mellea/issues/173)) ([`307dbe1`](https://github.com/generative-computing/mellea/commit/307dbe14d430b0128e56a2ed7b735dbe93adf2a7))
+* New context, new sampling ([#166](https://github.com/generative-computing/mellea/issues/166)) ([`4ae6d7c`](https://github.com/generative-computing/mellea/commit/4ae6d7c23e4aff63a0887dccaf7c96bc9e50121a))
+* Add async and streaming support ([#137](https://github.com/generative-computing/mellea/issues/137)) ([`4ee56a9`](https://github.com/generative-computing/mellea/commit/4ee56a9f9e74302cf677377d6eab19e11ab0a715))
+* Best-of-N Sampling with Process Reward Models ([#118](https://github.com/generative-computing/mellea/issues/118)) ([`b18e03d`](https://github.com/generative-computing/mellea/commit/b18e03d655f18f923202acf96a49d4acafa0701d))
+
 ## [v0.0.6](https://github.com/generative-computing/mellea/releases/tag/v0.0.6) - 2025-09-18
 
 ### Feature

mellea/backends/huggingface.py

Lines changed: 3 additions & 0 deletions
@@ -332,6 +332,7 @@ def _generate_from_context_standard(
         input_ids = self._tokenizer.apply_chat_template(  # type: ignore
             ctx_as_conversation,
             tools=convert_tools_to_json(tools),  # type: ignore
+            add_generation_prompt=True,
             return_tensors="pt",
             **self._make_backend_specific_and_remove(model_options),
         ).to(self._device)  # type: ignore
@@ -401,6 +402,7 @@ def _generate_from_context_standard(
             self.post_processing,
             conversation=ctx_as_conversation,
             input_ids=input_ids,
+            format=format,
             tool_calls=tool_calls,
             tools=tools,
             seed=seed,
@@ -457,6 +459,7 @@ async def post_processing(
         self,
         mot: ModelOutputThunk,
         conversation: list[dict],
+        format: type[BaseModelSubclass] | None,
         tool_calls: bool,
         tools: dict[str, Callable],
         seed,
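The `add_generation_prompt=True` addition changes what the chat template renders: without it, the prompt stops after the last conversation turn; with it, the template appends the assistant header so the model continues as the assistant rather than extending the user's message. A minimal sketch, assuming any `transformers` tokenizer that ships a chat template; the model id and message are illustrative and not taken from this diff.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
messages = [{"role": "user", "content": "What is today's temperature in Boston?"}]

# Rendered prompt without and with the assistant header appended.
print(tok.apply_chat_template(messages, tokenize=False))
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```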

mellea/backends/litellm.py

Lines changed: 4 additions & 0 deletions
@@ -53,6 +53,8 @@ def __init__(
     ):
         """Initialize an OpenAI-compatible backend. For any additional kwargs that you need to pass to the client, pass them as a part of **kwargs.
 
+        Note: If getting `Unclosed client session`, set `export DISABLE_AIOHTTP_TRANSPORT=True` in your environment. See: https://github.com/BerriAI/litellm/issues/13251.
+
         Args:
             model_id: The LiteLLM model identifier. Make sure that all necessary credentials are in OS environment variables.
             formatter: A custom formatter based on backend. If None, defaults to TemplateFormatter.
@@ -293,6 +295,7 @@ def _generate_from_chat_context_standard(
             conversation=conversation,
             tools=tools,
             thinking=thinking,
+            format=format,
         )
 
         try:
@@ -369,6 +372,7 @@ async def post_processing(
         conversation: list[dict],
         tools: dict[str, Callable],
         thinking,
+        format,
     ):
         """Called when generation is done."""
         # Reconstruct the chat_response from chunks if streamed.
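The new docstring note quotes the shell workaround (`export DISABLE_AIOHTTP_TRANSPORT=True`). A minimal sketch of an assumed Python equivalent, setting the variable before LiteLLM builds its async client; treating `os.environ` as interchangeable with the documented `export` is an assumption.

```python
import os

# Workaround for https://github.com/BerriAI/litellm/issues/13251:
# disable LiteLLM's aiohttp transport so the client session is closed cleanly.
# Must run before any litellm call creates its client.
os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
```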

mellea/backends/ollama.py

Lines changed: 2 additions & 1 deletion
@@ -343,7 +343,7 @@ def generate_from_chat_context(
         # each processing step.
         output._process = functools.partial(self.processing, tools=tools)
         output._post_process = functools.partial(
-            self.post_processing, conversation=conversation, tools=tools
+            self.post_processing, conversation=conversation, tools=tools, format=format
         )
 
         try:
@@ -506,6 +506,7 @@ async def post_processing(
         mot: ModelOutputThunk,
         conversation: list[dict],
         tools: dict[str, Callable],
+        format,
     ):
         """Called when generation is done."""
         assert mot._action is not None, (
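The same pattern appears in every backend touched by this commit: request-specific keyword arguments such as `format` are bound into the `_post_process` callable with `functools.partial`, so the generation machinery only has to pass the output thunk later. A minimal standalone sketch with simplified names (these are not the real backend signatures):

```python
import asyncio
import functools


async def post_processing(mot, *, conversation, tools, format):
    # Stand-in for a backend post-processing hook; all per-request state
    # arrives through the keyword arguments bound below.
    return {"mot": mot, "n_tools": len(tools), "format": format}


async def main():
    # Bind the per-request state once, up front...
    post = functools.partial(post_processing, conversation=[], tools={}, format=None)
    # ...so the caller later only supplies the generated output.
    print(await post("output-thunk-placeholder"))


asyncio.run(main())
```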

mellea/backends/openai.py

Lines changed: 2 additions & 0 deletions
@@ -502,6 +502,7 @@ def _generate_from_chat_context_standard(
             conversation=conversation,
             thinking=thinking,
             seed=model_opts.get(ModelOption.SEED, None),
+            format=format,
         )
 
         try:
@@ -569,6 +570,7 @@ async def post_processing(
         conversation: list[dict],
         thinking,
         seed,
+        format,
     ):
         """Called when generation is done."""
         # Reconstruct the chat_response from chunks if streamed.

mellea/backends/watsonx.py

Lines changed: 12 additions & 3 deletions
@@ -90,15 +90,15 @@ def __init__(
         if api_key is None:
             api_key = os.environ.get("WATSONX_API_KEY")
         if project_id is None:
-            project_id = os.environ.get("WATSONX_PROJECT_ID")
+            self._project_id = os.environ.get("WATSONX_PROJECT_ID")
 
         self._creds = Credentials(url=base_url, api_key=api_key)
         _client = APIClient(credentials=self._creds)
         self._model_inference = ModelInference(
             model_id=self._get_watsonx_model_id(),
             api_client=_client,
             credentials=self._creds,
-            project_id=project_id,
+            project_id=self._project_id,
             params=self.model_options,
             **kwargs,
         )
@@ -135,7 +135,14 @@ def __init__(
     @property
     def _model(self) -> ModelInference:
         """Watsonx's client gets tied to a specific event loop. Reset it here."""
-        self._model_inference.set_api_client(APIClient(self._creds))
+        _client = APIClient(credentials=self._creds)
+        self._model_inference = ModelInference(
+            model_id=self._get_watsonx_model_id(),
+            api_client=_client,
+            credentials=self._creds,
+            project_id=self._project_id,
+            params=self.model_options,
+        )
         return self._model_inference
 
     def _get_watsonx_model_id(self) -> str:
@@ -340,6 +347,7 @@ def generate_from_chat_context(
             conversation=conversation,
             tools=tools,
             seed=model_opts.get(ModelOption.SEED, None),
+            format=format,
         )
 
         try:
@@ -406,6 +414,7 @@ async def post_processing(
         conversation: list[dict],
         tools: dict[str, Callable],
         seed,
+        format,
     ):
         """Called when generation is done."""
         # Reconstruct the chat_response from chunks if streamed.
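The rewritten `_model` property rebuilds the `ModelInference` client on every access because, as its docstring says, the underlying async client is tied to the event loop it was created on. A toy sketch of that pattern using a stand-in class (nothing below uses the real ibm-watsonx-ai API):

```python
import asyncio


class LoopBoundClient:
    """Stand-in for a client that only works on the loop it was created on."""

    def __init__(self) -> None:
        self._loop = asyncio.get_running_loop()

    async def generate(self) -> str:
        if asyncio.get_running_loop() is not self._loop:
            raise RuntimeError("client is bound to a different event loop")
        return "ok"


class Backend:
    @property
    def _model(self) -> LoopBoundClient:
        # Rebuild the client on each access, mirroring the diff above, so every
        # event loop gets a client created inside that loop.
        return LoopBoundClient()


async def request(backend: Backend) -> str:
    return await backend._model.generate()


backend = Backend()
print(asyncio.run(request(backend)))  # first event loop
print(asyncio.run(request(backend)))  # second event loop, still fine
```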

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "pdm.backend"
 
 [project]
 name = "mellea"
-version = "0.0.6"
+version = "0.1.0"
 authors = [
     { name = "Nathan Fulton", email = "[email protected]" },
     { name = "Hendrik Strobelt", email = "[email protected]" },
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+import pydantic
+import pytest
+from typing_extensions import Annotated
+
+from mellea import MelleaSession
+from mellea.backends.aloras.huggingface.granite_aloras import add_granite_aloras
+from mellea.backends.cache import SimpleLRUCache
+from mellea.backends.formatter import TemplateFormatter
+from mellea.backends.huggingface import LocalHFBackend
+from mellea.backends.types import ModelOption
+from mellea.stdlib.base import CBlock, ChatContext
+from mellea.stdlib.requirement import (
+    ALoraRequirement,
+    LLMaJRequirement,
+    Requirement,
+    ValidationResult,
+    default_output_to_bool,
+)
+import mellea.backends.model_ids as model_ids
+
+
+@pytest.fixture(scope="module")
+def backend():
+    """Shared HuggingFace backend for all tests in this module."""
+    backend = LocalHFBackend(
+        model_id=model_ids.MISTRALAI_MISTRAL_0_3_7B,
+        cache=SimpleLRUCache(5),
+    )
+    # add_granite_aloras(backend)
+    return backend
+
+
+@pytest.fixture(scope="function")
+def session(backend):
+    """Fresh HuggingFace session for each test."""
+    session = MelleaSession(backend, ctx=ChatContext())
+    yield session
+    session.reset()
+
+
+@pytest.mark.qualitative
+def test_tool(session):
+    tool_call_history = []
+
+    def get_temperature(location: str) -> int:
+        """Returns today's temperature of the given city in Celsius.
+
+        Args:
+            location: a city name.
+        """
+        tool_call_history.append(location)
+        return 21
+
+    output = session.instruct(
+        "What is today's temperature in Boston? Answer in Celsius. Reply the number only.",
+        model_options={
+            ModelOption.TOOLS: [get_temperature],
+            ModelOption.MAX_NEW_TOKENS: 1000,
+        },
+        tool_calls=True,
+    )
+
+    assert output.tool_calls is not None
+
+    result = output.tool_calls["get_temperature"].call_func()
+    print(result)
+
+    assert len(tool_call_history) > 0
+    assert tool_call_history[0].lower() == "boston"
+    assert result == 21
+
+
+if __name__ == "__main__":
+    import pytest
+
+    pytest.main([__file__])

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.
