From 9498f0fac93af200f1dbedd79a15481721276013 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 22 Aug 2025 10:06:51 -0700 Subject: [PATCH 01/23] adding conftest.py for test configs --- test/conftest.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 test/conftest.py diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 00000000..a4ac821f --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,48 @@ +import os + +import pytest + +from mellea.backends.huggingface import LocalHFBackend +from mellea.backends.ollama import OllamaModelBackend +from mellea.backends.openai import OpenAIBackend +from mellea.stdlib.session import MelleaSession + + +@pytest.fixture(scope="session") +def gh_run() -> int: + return int(os.environ.get("GITHUB_ACTION", 0)) # type: ignore + + +def pytest_runtest_setup(item): + # Runs tests *not* marked with `@pytest.mark.llm` to run normally. + if not item.get_closest_marker("llm"): + return + + gh_run = int(os.environ.get("GITHUB_ACTION", 0)) + + if gh_run == 1: + pytest.skip( + reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." + ) + + # # Check if there is a session fixture. + # try: + # session: MelleaSession = item._request.getfixturevalue("m_session") + # except Exception: + # # Skip test cause all llm marked tests need a session fixture. + # pytest.skip("`llm` marked tests requires a `m_session` fixture.") + # # Get the Ollama name. + # if isinstance(session.backend, OllamaModelBackend) or isinstance(session.backend, OpenAIBackend): + # model_id = session.backend.model_id.ollama_name + # # Skip tests of the model name is llama 1b + # if model_id == "llama3.2:1b": + # pytest.skip( + # "Skipping LLM test: got model_id == llama3.2:1b in ollama. Used only in gh workflows." 
+ # ) + # elif isinstance(session.backend, LocalHFBackend): + # model_id = session.backend.model_id.hf_model_name + # # Skip tests of the model name is llama 1b + # if model_id == "unsloth/Llama-3.2-1B": + # pytest.skip( + # "Skipping LLM test: got model_id == unsloth/Llama-3.2-1B in hf. Used only in gh workflows." + # ) From 68cf8c6811398e6f6ea024d4bbeeca4ec671f7fd Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:30:38 -0700 Subject: [PATCH 02/23] adding modified tests with optional LLM runs --- test/backends/test_huggingface.py | 23 ++--- test/backends/test_ollama.py | 31 ++++--- test/backends/test_openai_ollama.py | 48 +++++++---- test/backends/test_watsonx.py | 22 +++-- test/conftest.py | 2 +- test/stdlib_basics/test_base.py | 3 +- test/stdlib_basics/test_chat_view.py | 5 +- test/stdlib_basics/test_contextual_session.py | 85 +++++++++++-------- test/stdlib_basics/test_genslot.py | 2 +- test/stdlib_basics/test_session.py | 15 +++- 10 files changed, 142 insertions(+), 94 deletions(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 625f22d3..f91711c2 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -21,6 +21,7 @@ @pytest.fixture(scope="module") def backend(): """Shared HuggingFace backend for all tests in this module.""" + # TODO: find a smalle 1B model to do Alora stuff on github actions. backend = LocalHFBackend( model_id="ibm-granite/granite-3.2-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"), @@ -37,7 +38,7 @@ def session(backend): yield session session.reset() - +@pytest.mark.llm def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -45,7 +46,7 @@ def test_system_prompt(session): ) print(result) - +@pytest.mark.llm def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -63,7 +64,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output - +@pytest.mark.llm def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -79,7 +80,7 @@ def test_constraint_lora_with_requirement(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) in ["Y", "N"] - +@pytest.mark.llm def test_constraint_lora_override(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -94,7 +95,7 @@ def test_constraint_lora_override(session, backend): assert isinstance(default_output_to_bool(str(val_result.reason)), bool) backend.default_to_constraint_checking_alora = True - +@pytest.mark.llm def test_constraint_lora_override_does_not_override_alora(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -111,7 +112,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend): assert str(val_result.reason) in ["Y", "N"] backend.default_to_constraint_checking_alora = True - +@pytest.mark.llm def test_llmaj_req_does_not_use_alora(session, backend): backend.default_to_constraint_checking_alora = True # type: ignore answer = session.instruct( @@ -127,12 +128,12 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] - +@pytest.mark.llm def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) - +@pytest.mark.llm def test_multiturn(session): session.instruct("Compute 1+1") beta = session.instruct( @@ -142,7 +143,7 @@ def test_multiturn(session): words = session.instruct("Now list five English words that start with that letter.") print(words) - 
+@pytest.mark.llm def test_format(session): class Person(pydantic.BaseModel): name: str @@ -172,7 +173,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) - +@pytest.mark.llm def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -182,7 +183,7 @@ def test_generate_from_raw(session): assert len(results) == len(prompts) - +@pytest.mark.llm def test_generate_from_raw_with_format(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index 78b07de4..fdba285a 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -1,11 +1,13 @@ -from mellea import start_session, SimpleContext -from mellea.stdlib.base import CBlock -from mellea.stdlib.requirement import Requirement -import pydantic import json + +import pydantic +import pytest from typing_extensions import Annotated + +from mellea import SimpleContext, start_session from mellea.backends.types import ModelOption -import pytest +from mellea.stdlib.base import CBlock +from mellea.stdlib.requirement import Requirement @pytest.fixture(scope="function") @@ -15,6 +17,8 @@ def session(): yield session session.reset() + +@pytest.mark.llm def test_simple_instruct(session): result = session.instruct( "Write an email to Hendrik trying to sell him self-sealing stembolts." @@ -23,6 +27,8 @@ def test_simple_instruct(session): assert "chat_response" in result._meta assert result._meta["chat_response"].message.role == "assistant" + +@pytest.mark.llm def test_instruct_with_requirement(session): response = session.instruct( "Write an email to Hendrik convincing him to buy some self-sealing stembolts." 
@@ -45,12 +51,14 @@ def test_instruct_with_requirement(session): ) print(results) +@pytest.mark.llm def test_chat(session): output_message = session.chat("What is 1+1?") - assert ( - "2" in output_message.content - ), f"Expected a message with content containing 2 but found {output_message}" + assert "2" in output_message.content, ( + f"Expected a message with content containing 2 but found {output_message}" + ) +@pytest.mark.llm def test_format(session): class Person(pydantic.BaseModel): name: str @@ -83,6 +91,7 @@ class Email(pydantic.BaseModel): # assert email.to.email_address.endswith("example.com") pass +@pytest.mark.llm def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -113,9 +122,9 @@ class Answer(pydantic.BaseModel): try: answer = Answer.model_validate_json(random_result.value) except pydantic.ValidationError as e: - assert ( - False - ), f"formatting directive failed for {random_result.value}: {e.json()}" + assert False, ( + f"formatting directive failed for {random_result.value}: {e.json()}" + ) if __name__ == "__main__": diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 5bf3b5f3..200b539d 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -1,41 +1,52 @@ # test/rits_backend_tests/test_openai_integration.py +import pydantic +import pytest +from typing_extensions import Annotated + from mellea import MelleaSession -from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk -from mellea.backends.openai import OpenAIBackend from mellea.backends.formatter import TemplateFormatter +from mellea.backends.model_ids import META_LLAMA_3_2_1B +from mellea.backends.openai import OpenAIBackend from mellea.backends.types import ModelOption - -import pydantic -from typing_extensions import Annotated -import pytest +from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk 
@pytest.fixture(scope="module") -def backend(): +def backend(gh_run: int): """Shared OpenAI backend configured for Ollama.""" - return OpenAIBackend( - model_id="granite3.3:8b", - formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), + if gh_run == 1: + return OpenAIBackend( + model_id=META_LLAMA_3_2_1B, + formatter=TemplateFormatter(model_id=META_LLAMA_3_2_1B), base_url="http://localhost:11434/v1", api_key="ollama", ) + else: + return OpenAIBackend( + model_id="granite3.3:8b", + formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), + base_url="http://localhost:11434/v1", + api_key="ollama", + ) @pytest.fixture(scope="function") -def session(backend): +def m_session(backend): """Fresh OpenAI session for each test.""" session = MelleaSession(backend, ctx=LinearContext(is_chat_context=True)) yield session session.reset() -def test_instruct(session): - result = session.instruct("Compute 1+1.") +@pytest.mark.llm +def test_instruct(m_session): + result = m_session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore -def test_multiturn(session): - session.instruct("What is the capital of France?") - answer = session.instruct("Tell me the answer to the previous question.") +@pytest.mark.llm +def test_multiturn(m_session): + m_session.instruct("What is the capital of France?") + answer = m_session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore # def test_api_timeout_error(self): @@ -53,7 +64,8 @@ def test_multiturn(session): # assert "granite3.3:8b" in result.value # self.m.reset() -def test_format(session): +@pytest.mark.llm +def test_format(m_session): class Person(pydantic.BaseModel): name: str # it does not support regex patterns in json schema @@ -68,7 +80,7 @@ class Email(pydantic.BaseModel): subject: str body: str - output = session.instruct( + output = m_session.instruct( "Write a short email to Olivia, 
thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ", format=Email, model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index b62def03..af8634b6 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -1,14 +1,15 @@ # test/rits_backend_tests/test_watsonx_integration.py import os -from mellea import MelleaSession -from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk -from mellea.backends.watsonx import WatsonxAIBackend -from mellea.backends.formatter import TemplateFormatter -from mellea.backends.types import ModelOption import pydantic -from typing_extensions import Annotated import pytest +from typing_extensions import Annotated + +from mellea import MelleaSession +from mellea.backends.formatter import TemplateFormatter +from mellea.backends.types import ModelOption +from mellea.backends.watsonx import WatsonxAIBackend +from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk @pytest.fixture(scope="module") @@ -28,18 +29,21 @@ def session(backend): session.reset() - - +@pytest.mark.llm def test_instruct(session): result = session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore + +@pytest.mark.llm def test_multiturn(session): session.instruct("What is the capital of France?") answer = session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore + +@pytest.mark.llm def test_format(session): class Person(pydantic.BaseModel): name: str @@ -72,6 +76,8 @@ class Email(pydantic.BaseModel): # assert email.to.email_address.endswith("example.com") pass + +@pytest.mark.llm def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/conftest.py b/test/conftest.py index a4ac821f..e81414f2 100644 --- 
a/test/conftest.py +++ b/test/conftest.py @@ -21,7 +21,7 @@ def pytest_runtest_setup(item): gh_run = int(os.environ.get("GITHUB_ACTION", 0)) if gh_run == 1: - pytest.skip( + pytest.xfail( reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." ) diff --git a/test/stdlib_basics/test_base.py b/test/stdlib_basics/test_base.py index 9ec99db6..6d4008bd 100644 --- a/test/stdlib_basics/test_base.py +++ b/test/stdlib_basics/test_base.py @@ -1,5 +1,4 @@ -from mellea.stdlib.base import Component, CBlock -from mellea.stdlib.base import LinearContext +from mellea.stdlib.base import CBlock, Component, LinearContext def test_cblock(): diff --git a/test/stdlib_basics/test_chat_view.py b/test/stdlib_basics/test_chat_view.py index c0d9e9eb..c56b7c2e 100644 --- a/test/stdlib_basics/test_chat_view.py +++ b/test/stdlib_basics/test_chat_view.py @@ -1,7 +1,8 @@ import pytest -from mellea.stdlib.base import ModelOutputThunk, LinearContext -from mellea.stdlib.chat import as_chat_history, Message + +from mellea.stdlib.base import LinearContext, ModelOutputThunk +from mellea.stdlib.chat import Message, as_chat_history from mellea.stdlib.session import start_session diff --git a/test/stdlib_basics/test_contextual_session.py b/test/stdlib_basics/test_contextual_session.py index 97699831..4bff317f 100644 --- a/test/stdlib_basics/test_contextual_session.py +++ b/test/stdlib_basics/test_contextual_session.py @@ -1,10 +1,21 @@ -import pytest from typing import Literal -from mellea import generative, start_session, instruct, chat, validate, query, transform + +import pytest + +from mellea import chat, generative, instruct, query, start_session, transform, validate +from mellea.backends.model_ids import IBM_GRANITE_3_3_8B, META_LLAMA_3_2_1B from mellea.stdlib.base import ModelOutputThunk -from mellea.stdlib.session import get_session, MelleaSession -from mellea.stdlib.mify import mify, MifiedProtocol +from mellea.stdlib.mify import MifiedProtocol, mify from 
mellea.stdlib.requirement import req +from mellea.stdlib.session import MelleaSession, get_session + + +@pytest.fixture(scope="module") +def model_id(gh_run: int): + if gh_run == 1: + return META_LLAMA_3_2_1B + else: + return IBM_GRANITE_3_3_8B @generative @@ -26,9 +37,9 @@ def get_info(self) -> str: return f"{self.name} is {self.age} years old" -def test_basic_contextual_session(): +def test_basic_contextual_session(model_id): """Test basic contextual session usage with convenience functions.""" - with start_session(): + with start_session(model_id=model_id): # Test instruct result = instruct("Say hello") assert isinstance(result, ModelOutputThunk) @@ -51,9 +62,9 @@ def test_no_active_session_error(): chat("test") -def test_generative_with_contextual_session(): +def test_generative_with_contextual_session(model_id): """Test generative slots work with contextual sessions.""" - with start_session(): + with start_session(model_id=model_id): # Test without explicit session parameter result = classify_sentiment(text="I love this!") assert result in ["positive", "negative"] @@ -63,18 +74,18 @@ def test_generative_with_contextual_session(): assert isinstance(summary, str) assert len(summary) > 0 - -def test_generative_backward_compatibility(): +@pytest.mark.llm +def test_generative_backward_compatibility(model_id): """Test that generative slots still work with explicit session parameter.""" - with start_session() as m: + with start_session(model_id=model_id) as m: # Test old pattern still works result = classify_sentiment(m, text="I love this!") assert result in ["positive", "negative"] -def test_mify_with_contextual_session(): +def test_mify_with_contextual_session(model_id): """Test mify functionality with contextual sessions.""" - with start_session(): + with start_session(model_id=model_id): person = TestPerson("Alice", 30) assert isinstance(person, MifiedProtocol) @@ -88,13 +99,13 @@ def test_mify_with_contextual_session(): assert transform_result is not None -def 
test_nested_sessions(): +def test_nested_sessions(model_id): """Test nested sessions behavior.""" - with start_session() as outer_session: + with start_session(model_id=model_id) as outer_session: outer_result = instruct("outer session test") assert isinstance(outer_result, ModelOutputThunk) - with start_session() as inner_session: + with start_session(model_id=model_id) as inner_session: # Inner session should be active current_session = get_session() assert current_session is inner_session @@ -107,10 +118,10 @@ def test_nested_sessions(): assert current_session is outer_session -def test_session_cleanup(): +def test_session_cleanup(model_id): """Test session cleanup after context exit.""" session_ref = None - with start_session() as m: + with start_session(model_id=model_id) as m: session_ref = m instruct("test during session") @@ -119,19 +130,19 @@ def test_session_cleanup(): get_session() # Session should have been cleaned up - assert hasattr(session_ref, 'ctx') + assert hasattr(session_ref, "ctx") -def test_all_convenience_functions(): +def test_all_convenience_functions(model_id): """Test all convenience functions work within contextual session.""" - with start_session(): + with start_session(model_id=model_id): # Test instruct instruct_result = instruct("Generate a greeting") assert isinstance(instruct_result, ModelOutputThunk) # Test chat chat_result = chat("Hello there") - assert hasattr(chat_result, 'content') + assert hasattr(chat_result, "content") # Test validate validation = validate([req("The response should be positive")]) @@ -147,18 +158,18 @@ def test_all_convenience_functions(): assert transform_result is not None -def test_session_with_parameters(): +def test_session_with_parameters(model_id): """Test contextual session with custom parameters.""" - with start_session(backend_name="ollama", model_id="granite3.3:8b") as m: + with start_session(backend_name="ollama", model_id=META_LLAMA_3_2_1B) as m: result = instruct("test with parameters") assert 
isinstance(result, ModelOutputThunk) assert isinstance(m, MelleaSession) -def test_multiple_sequential_sessions(): +def test_multiple_sequential_sessions(model_id): """Test multiple sequential contextual sessions.""" # First session - with start_session(): + with start_session(model_id=model_id): result1 = instruct("first session") assert isinstance(result1, ModelOutputThunk) @@ -167,14 +178,14 @@ def test_multiple_sequential_sessions(): get_session() # Second session - with start_session(): + with start_session(model_id=model_id): result2 = instruct("second session") assert isinstance(result2, ModelOutputThunk) -def test_contextual_session_with_mified_object_methods(): +def test_contextual_session_with_mified_object_methods(model_id): """Test that mified objects work properly within contextual sessions.""" - with start_session(): + with start_session(model_id=model_id): person = TestPerson("Bob", 25) # Test that mified object methods work @@ -187,12 +198,12 @@ def test_contextual_session_with_mified_object_methods(): # Test format_for_llm llm_format = person.format_for_llm() assert llm_format is not None - assert hasattr(llm_format, 'args') + assert hasattr(llm_format, "args") -def test_session_methods_with_mified_objects(): +def test_session_methods_with_mified_objects(model_id): """Test using session query/transform methods with mified objects.""" - with start_session() as m: + with start_session(model_id=model_id) as m: person = TestPerson("Charlie", 35) # Test session query method @@ -205,11 +216,11 @@ def test_session_methods_with_mified_objects(): assert transform_result is not None # Verify mified objects have query/transform object creation methods - assert hasattr(person, 'get_query_object') - assert hasattr(person, 'get_transform_object') - assert hasattr(person, '_query_type') - assert hasattr(person, '_transform_type') + assert hasattr(person, "get_query_object") + assert hasattr(person, "get_transform_object") + assert hasattr(person, "_query_type") + 
assert hasattr(person, "_transform_type") if __name__ == "__main__": - pytest.main([__file__]) \ No newline at end of file + pytest.main([__file__]) diff --git a/test/stdlib_basics/test_genslot.py b/test/stdlib_basics/test_genslot.py index 8aa51828..58cc974a 100644 --- a/test/stdlib_basics/test_genslot.py +++ b/test/stdlib_basics/test_genslot.py @@ -33,7 +33,7 @@ def test_func(session): write_email_component = write_me_an_email(session) assert isinstance(write_email_component, str) - +@pytest.mark.llm def test_sentiment_output(classify_sentiment_output): assert classify_sentiment_output in ["positive", "negative"] diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 351c08d8..7c592ac4 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -1,4 +1,5 @@ import pytest + from mellea.stdlib.base import ModelOutputThunk from mellea.stdlib.session import start_session @@ -10,13 +11,21 @@ def test_start_session_watsonx(): assert response.value is not None -def test_start_session_openai_with_kwargs(): - m = start_session( +def test_start_session_openai_with_kwargs(gh_run): + if gh_run == 1: + m = start_session( "openai", - model_id="granite3.3:8b", + model_id="llama3.2:1b", base_url="http://localhost:11434/v1", api_key="ollama", ) + else: + m = start_session( + "openai", + model_id="granite3.3:8b", + base_url="http://localhost:11434/v1", + api_key="ollama", + ) response = m.instruct("testing") assert isinstance(response, ModelOutputThunk) assert response.value is not None From b3189866fbb0ca79a4aebd12460ea1e9b4ccd80c Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:30:59 -0700 Subject: [PATCH 03/23] adding llama 1b in model ids --- mellea/backends/model_ids.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py index 40db617f..a750998c 100644 --- a/mellea/backends/model_ids.py +++ b/mellea/backends/model_ids.py 
@@ -89,6 +89,10 @@ class ModelIdentifier: ollama_name="llama-guard3:1b", hf_model_name="unsloth/Llama-Guard-3-1B" ) +META_LLAMA_3_2_1B = ModelIdentifier( + ollama_name="llama3.2:1b", hf_model_name="unsloth/Llama-3.2-1B" +) + ######################## #### Mistral models #### ######################## From 733b53c6da76f6dfe7e090b108e84b298693c11a Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:31:40 -0700 Subject: [PATCH 04/23] adding tests to config and workflow --- .github/workflows/run tests.yaml | 41 ++++++++++++++++++++++++++++++++ pyproject.toml | 5 ++++ 2 files changed, 46 insertions(+) create mode 100644 .github/workflows/run tests.yaml diff --git a/.github/workflows/run tests.yaml b/.github/workflows/run tests.yaml new file mode 100644 index 00000000..3a7b28ac --- /dev/null +++ b/.github/workflows/run tests.yaml @@ -0,0 +1,41 @@ +name: Test non-llm components + +on: + workflow_run: + workflows: ["Verify Code Quality"] + types: + - completed + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }} + cancel-in-progress: true + +jobs: + tests: + if: ${{ github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
+ env: + GITHUB_ACTIONS: 1 + + steps: + - uses: actions/checkout@v4 + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v5 + with: + python-version: ${{ matrix.python-version }} + enable-cache: true + - name: Install dependencies + run: uv sync --frozen --all-extras --group dev + - name: Install Ollama + run: curl -fsSL https://ollama.com/install.sh | sh + - name: Pull Llama 3.2:1b model + run: ollama pull llama3.2:1b + - name: Start serving the model + run: nohup ollama serve & + + - name: Run Tests + run: uv run -m pytest -v test diff --git a/pyproject.toml b/pyproject.toml index 1548e1d9..3eebf620 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -161,3 +161,8 @@ skip = 'requirements.txt,uv.lock' [tool.mypy] disable_error_code = ["empty-body", "import-untyped"] python_version = "3.10" + +[tool.pytest.ini_options] +markers = [ + "llm: Marks the test as needing an exact output from an LLM (deselect with '-m \" not llm\"'); this depends on the session.backend.model_id" +] \ No newline at end of file From 98661573cb84ba101d4e98e9fc72326235670a24 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:43:07 -0700 Subject: [PATCH 05/23] renaming workflow --- .github/workflows/{run tests.yaml => run_llm_tests.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{run tests.yaml => run_llm_tests.yaml} (100%) diff --git a/.github/workflows/run tests.yaml b/.github/workflows/run_llm_tests.yaml similarity index 100% rename from .github/workflows/run tests.yaml rename to .github/workflows/run_llm_tests.yaml From 1eccc3327c1bef83f9db6135be706358b2b72025 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 10:03:49 -0700 Subject: [PATCH 06/23] small changes --- .github/workflows/run_llm_tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run_llm_tests.yaml b/.github/workflows/run_llm_tests.yaml index 3a7b28ac..55f80fb9 100644 --- 
a/.github/workflows/run_llm_tests.yaml +++ b/.github/workflows/run_llm_tests.yaml @@ -2,9 +2,9 @@ name: Test non-llm components on: workflow_run: - workflows: ["Verify Code Quality"] - types: - - completed + workflows: [Verify Code Quality] + types: [completed] + concurrency: group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }} From 0ae217d76f36865fba4d3b1c8b0727479a5a27e6 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 11:21:41 -0700 Subject: [PATCH 07/23] trying to test workflow --- .github/workflows/run_llm_tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_llm_tests.yaml b/.github/workflows/run_llm_tests.yaml index 55f80fb9..33bc0785 100644 --- a/.github/workflows/run_llm_tests.yaml +++ b/.github/workflows/run_llm_tests.yaml @@ -1,6 +1,7 @@ name: Test non-llm components on: + workflow_dispatch: {} workflow_run: workflows: [Verify Code Quality] types: [completed] From aa7760afe86833246e5e06f1f6fc9e1cd8e3a13a Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 11:56:45 -0700 Subject: [PATCH 08/23] adding the tests to the quality workflow --- .github/workflows/quality.yml | 16 +++++++++-- .github/workflows/run_llm_tests.yaml | 42 ---------------------------- 2 files changed, 14 insertions(+), 44 deletions(-) delete mode 100644 .github/workflows/run_llm_tests.yaml diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 7634b8be..d2bdb5d3 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -31,9 +31,21 @@ jobs: path: ~/.cache/pre-commit key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} - name: Install dependencies - run: uv sync --frozen --all-extras + run: uv sync --frozen --all-extras --group dev - name: Check style and run tests run: pre-commit run --all-files - - name: Send failure message + - name: Send failure message pre-commit 
if: failure() # This step will only run if a previous step failed run: echo "The quality verification failed. Please run precommit " + - name: Install Ollama + run: curl -fsSL https://ollama.com/install.sh | sh + - name: Pull Llama 3.2:1b model + run: ollama pull llama3.2:1b + - name: Start serving the model + run: nohup ollama serve & + - name: Run Tests + run: uv run -m pytest -v test + - name: Send failure message tests + if: failure() # This step will only run if a previous step failed + run: echo "Tests failed. Please verify that tests are working locally." + diff --git a/.github/workflows/run_llm_tests.yaml b/.github/workflows/run_llm_tests.yaml deleted file mode 100644 index 33bc0785..00000000 --- a/.github/workflows/run_llm_tests.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: Test non-llm components - -on: - workflow_dispatch: {} - workflow_run: - workflows: [Verify Code Quality] - types: [completed] - - -concurrency: - group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }} - cancel-in-progress: true - -jobs: - tests: - if: ${{ github.event.workflow_run.conclusion == 'success' }} - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
- env: - GITHUB_ACTIONS: 1 - - steps: - - uses: actions/checkout@v4 - - name: Install uv and set the python version - uses: astral-sh/setup-uv@v5 - with: - python-version: ${{ matrix.python-version }} - enable-cache: true - - name: Install dependencies - run: uv sync --frozen --all-extras --group dev - - name: Install Ollama - run: curl -fsSL https://ollama.com/install.sh | sh - - name: Pull Llama 3.2:1b model - run: ollama pull llama3.2:1b - - name: Start serving the model - run: nohup ollama serve & - - - name: Run Tests - run: uv run -m pytest -v test From 89b4834ac910494209aea787e18c0dddf91f26d0 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 12:01:51 -0700 Subject: [PATCH 09/23] adding env variable to disable tests --- .github/workflows/quality.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index d2bdb5d3..42517e22 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -17,6 +17,8 @@ jobs: strategy: matrix: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
+ env: + GITHUB_ACTIONS: 1 steps: - uses: actions/checkout@v4 - name: Install uv and set the python version From 8e6191056976cbbb6a197012d2ac2deba14bd870 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 15:57:03 -0700 Subject: [PATCH 10/23] changing marker name llm -> qualitative --- pyproject.toml | 2 +- test/conftest.py | 26 ++------------------------ 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3eebf620..6614b40a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,5 +164,5 @@ python_version = "3.10" [tool.pytest.ini_options] markers = [ - "llm: Marks the test as needing an exact output from an LLM (deselect with '-m \" not llm\"'); this depends on the session.backend.model_id" + "qualitative: Marks the test as needing an exact output from an LLM; set by an ENV variable for GITHUB_ACTIONS. All tests marked with this will xfail in CI/CD" ] \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index e81414f2..7ded3d8b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -15,34 +15,12 @@ def gh_run() -> int: def pytest_runtest_setup(item): # Runs tests *not* marked with `@pytest.mark.llm` to run normally. - if not item.get_closest_marker("llm"): + if not item.get_closest_marker("qualitative"): return gh_run = int(os.environ.get("GITHUB_ACTION", 0)) if gh_run == 1: pytest.xfail( - reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." + reason="Skipping qualitative test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." ) - - # # Check if there is a session fixture. - # try: - # session: MelleaSession = item._request.getfixturevalue("m_session") - # except Exception: - # # Skip test cause all llm marked tests need a session fixture. - # pytest.skip("`llm` marked tests requires a `m_session` fixture.") - # # Get the Ollama name. 
- # if isinstance(session.backend, OllamaModelBackend) or isinstance(session.backend, OpenAIBackend): - # model_id = session.backend.model_id.ollama_name - # # Skip tests of the model name is llama 1b - # if model_id == "llama3.2:1b": - # pytest.skip( - # "Skipping LLM test: got model_id == llama3.2:1b in ollama. Used only in gh workflows." - # ) - # elif isinstance(session.backend, LocalHFBackend): - # model_id = session.backend.model_id.hf_model_name - # # Skip tests of the model name is llama 1b - # if model_id == "unsloth/Llama-3.2-1B": - # pytest.skip( - # "Skipping LLM test: got model_id == unsloth/Llama-3.2-1B in hf. Used only in gh workflows." - # ) From cb754f2dcfed8392712a2a61c21dbd1b7ea8c8e9 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 15:58:59 -0700 Subject: [PATCH 11/23] changing test markers --- .github/workflows/quality.yml | 2 +- test/backends/test_huggingface.py | 23 +++++++++---------- test/backends/test_ollama.py | 10 ++++---- test/backends/test_openai_ollama.py | 6 ++--- test/backends/test_watsonx.py | 8 +++---- test/conftest.py | 2 +- test/stdlib_basics/test_contextual_session.py | 2 +- test/stdlib_basics/test_genslot.py | 2 +- 8 files changed, 27 insertions(+), 28 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 42517e22..6c6c3da8 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -18,7 +18,7 @@ jobs: matrix: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
env: - GITHUB_ACTIONS: 1 + GITHUB_ACTION: 1 steps: - uses: actions/checkout@v4 - name: Install uv and set the python version diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index f91711c2..68c907d9 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -21,7 +21,6 @@ @pytest.fixture(scope="module") def backend(): """Shared HuggingFace backend for all tests in this module.""" - # TODO: find a smalle 1B model to do Alora stuff on github actions. backend = LocalHFBackend( model_id="ibm-granite/granite-3.2-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"), @@ -38,7 +37,7 @@ def session(backend): yield session session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -46,7 +45,7 @@ def test_system_prompt(session): ) print(result) -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -64,7 +63,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -80,7 +79,7 @@ def test_constraint_lora_with_requirement(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) in ["Y", "N"] -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_lora_override(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -95,7 +94,7 @@ def test_constraint_lora_override(session, backend): assert isinstance(default_output_to_bool(str(val_result.reason)), bool) backend.default_to_constraint_checking_alora = True -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_lora_override_does_not_override_alora(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -112,7 +111,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend): assert str(val_result.reason) in ["Y", "N"] backend.default_to_constraint_checking_alora = True -@pytest.mark.llm +@pytest.mark.qualitative def test_llmaj_req_does_not_use_alora(session, backend): backend.default_to_constraint_checking_alora = True # type: ignore answer = session.instruct( @@ -128,12 +127,12 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) -@pytest.mark.llm +@pytest.mark.qualitative def test_multiturn(session): session.instruct("Compute 1+1") beta = session.instruct( @@ 
-143,7 +142,7 @@ def test_multiturn(session): words = session.instruct("Now list five English words that start with that letter.") print(words) -@pytest.mark.llm +@pytest.mark.qualitative def test_format(session): class Person(pydantic.BaseModel): name: str @@ -173,7 +172,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -183,7 +182,7 @@ def test_generate_from_raw(session): assert len(results) == len(prompts) -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw_with_format(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index fdba285a..b90d93fb 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -18,7 +18,7 @@ def session(): session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_simple_instruct(session): result = session.instruct( "Write an email to Hendrik trying to sell him self-sealing stembolts." @@ -28,7 +28,7 @@ def test_simple_instruct(session): assert result._meta["chat_response"].message.role == "assistant" -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct_with_requirement(session): response = session.instruct( "Write an email to Hendrik convincing him to buy some self-sealing stembolts." 
@@ -51,14 +51,14 @@ def test_instruct_with_requirement(session): ) print(results) -@pytest.mark.llm +@pytest.mark.qualitative def test_chat(session): output_message = session.chat("What is 1+1?") assert "2" in output_message.content, ( f"Expected a message with content containing 2 but found {output_message}" ) -@pytest.mark.llm +@pytest.mark.qualitative def test_format(session): class Person(pydantic.BaseModel): name: str @@ -91,7 +91,7 @@ class Email(pydantic.BaseModel): # assert email.to.email_address.endswith("example.com") pass -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 200b539d..f3c928c3 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -37,13 +37,13 @@ def m_session(backend): yield session session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct(m_session): result = m_session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore -@pytest.mark.llm +@pytest.mark.qualitative def test_multiturn(m_session): m_session.instruct("What is the capital of France?") answer = m_session.instruct("Tell me the answer to the previous question.") @@ -64,7 +64,7 @@ def test_multiturn(m_session): # assert "granite3.3:8b" in result.value # self.m.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_format(m_session): class Person(pydantic.BaseModel): name: str diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index af8634b6..255f2164 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -29,21 +29,21 @@ def session(backend): session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct(session): result = session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) 
assert "2" in result.value # type: ignore -@pytest.mark.llm +@pytest.mark.qualitative def test_multiturn(session): session.instruct("What is the capital of France?") answer = session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore -@pytest.mark.llm +@pytest.mark.qualitative def test_format(session): class Person(pydantic.BaseModel): name: str @@ -77,7 +77,7 @@ class Email(pydantic.BaseModel): pass -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/conftest.py b/test/conftest.py index 7ded3d8b..906ab5f7 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -14,7 +14,7 @@ def gh_run() -> int: def pytest_runtest_setup(item): - # Runs tests *not* marked with `@pytest.mark.llm` to run normally. + # Runs tests *not* marked with `@pytest.mark.qualitative` to run normally. if not item.get_closest_marker("qualitative"): return diff --git a/test/stdlib_basics/test_contextual_session.py b/test/stdlib_basics/test_contextual_session.py index 4bff317f..1a290149 100644 --- a/test/stdlib_basics/test_contextual_session.py +++ b/test/stdlib_basics/test_contextual_session.py @@ -74,7 +74,7 @@ def test_generative_with_contextual_session(model_id): assert isinstance(summary, str) assert len(summary) > 0 -@pytest.mark.llm +@pytest.mark.qualitative def test_generative_backward_compatibility(model_id): """Test that generative slots still work with explicit session parameter.""" with start_session(model_id=model_id) as m: diff --git a/test/stdlib_basics/test_genslot.py b/test/stdlib_basics/test_genslot.py index 58cc974a..f47b7577 100644 --- a/test/stdlib_basics/test_genslot.py +++ b/test/stdlib_basics/test_genslot.py @@ -33,7 +33,7 @@ def test_func(session): write_email_component = write_me_an_email(session) assert isinstance(write_email_component, str) -@pytest.mark.llm +@pytest.mark.qualitative def 
test_sentiment_output(classify_sentiment_output): assert classify_sentiment_output in ["positive", "negative"] From 26d5dfcef123c75f1cff0b654307f8a6564ee9d4 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 16:15:08 -0700 Subject: [PATCH 12/23] addressing PR comments --- test/backends/test_huggingface.py | 16 +++++++++++----- test/stdlib_basics/test_contextual_session.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 68c907d9..08213667 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -37,7 +37,7 @@ def session(backend): yield session session.reset() -@pytest.mark.qualitative + def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -45,7 +45,7 @@ def test_system_prompt(session): ) print(result) -@pytest.mark.qualitative + def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -63,7 +63,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output -@pytest.mark.qualitative + def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -79,6 +79,7 @@ def test_constraint_lora_with_requirement(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) in ["Y", "N"] + @pytest.mark.qualitative def test_constraint_lora_override(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore @@ -94,6 +95,7 @@ def test_constraint_lora_override(session, backend): assert isinstance(default_output_to_bool(str(val_result.reason)), bool) backend.default_to_constraint_checking_alora = True + @pytest.mark.qualitative def test_constraint_lora_override_does_not_override_alora(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore @@ -111,6 +113,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend): assert str(val_result.reason) in ["Y", "N"] backend.default_to_constraint_checking_alora = True + @pytest.mark.qualitative def test_llmaj_req_does_not_use_alora(session, backend): backend.default_to_constraint_checking_alora = True # type: ignore @@ -127,11 +130,12 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] -@pytest.mark.qualitative + def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) + @pytest.mark.qualitative def test_multiturn(session): session.instruct("Compute 1+1") @@ -142,6 +146,7 @@ def test_multiturn(session): words = session.instruct("Now list five English words that start with that letter.") print(words) + @pytest.mark.qualitative def test_format(session): class 
Person(pydantic.BaseModel): @@ -172,7 +177,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) -@pytest.mark.qualitative + def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -182,6 +187,7 @@ def test_generate_from_raw(session): assert len(results) == len(prompts) + @pytest.mark.qualitative def test_generate_from_raw_with_format(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/stdlib_basics/test_contextual_session.py b/test/stdlib_basics/test_contextual_session.py index 1a290149..a142f879 100644 --- a/test/stdlib_basics/test_contextual_session.py +++ b/test/stdlib_basics/test_contextual_session.py @@ -160,7 +160,7 @@ def test_all_convenience_functions(model_id): def test_session_with_parameters(model_id): """Test contextual session with custom parameters.""" - with start_session(backend_name="ollama", model_id=META_LLAMA_3_2_1B) as m: + with start_session(backend_name="ollama", model_id=model_id) as m: result = instruct("test with parameters") assert isinstance(result, ModelOutputThunk) assert isinstance(m, MelleaSession) From 6102d2e7fed70a42b954c78b00b0b498977f7570 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 16:51:59 -0700 Subject: [PATCH 13/23] changing ollama port --- test/backends/test_openai_ollama.py | 6 ++++-- test/stdlib_basics/test_session.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index f3c928c3..def41004 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -1,4 +1,6 @@ # test/rits_backend_tests/test_openai_integration.py +import os + import pydantic import pytest from typing_extensions import Annotated @@ -18,14 +20,14 @@ def backend(gh_run: int): return OpenAIBackend( model_id=META_LLAMA_3_2_1B, 
formatter=TemplateFormatter(model_id=META_LLAMA_3_2_1B), - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) else: return OpenAIBackend( model_id="granite3.3:8b", formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 7c592ac4..6ce61c14 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -16,14 +16,14 @@ def test_start_session_openai_with_kwargs(gh_run): m = start_session( "openai", model_id="llama3.2:1b", - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) else: m = start_session( "openai", model_id="granite3.3:8b", - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) response = m.instruct("testing") From 752d43d5acc6537428442b26331f09f9eede4f19 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 16:58:43 -0700 Subject: [PATCH 14/23] changing ollama port --- .github/workflows/quality.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 6c6c3da8..7e41015f 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -19,6 +19,7 @@ jobs: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
env: GITHUB_ACTION: 1 + OLLAMA_HOST: "127.0.0.1:5000" steps: - uses: actions/checkout@v4 - name: Install uv and set the python version From c0c76a4199a71a8a2efc606ea15ac61b1802872a Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 17:03:17 -0700 Subject: [PATCH 15/23] changing ollama order --- .github/workflows/quality.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 7e41015f..1e3a97ba 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -42,10 +42,11 @@ jobs: run: echo "The quality verification failed. Please run precommit " - name: Install Ollama run: curl -fsSL https://ollama.com/install.sh | sh + - name: Start serving ollama + run: nohup ollama serve & - name: Pull Llama 3.2:1b model run: ollama pull llama3.2:1b - - name: Start serving the model - run: nohup ollama serve & + - name: Run Tests run: uv run -m pytest -v test - name: Send failure message tests From 7f9f8ced73fc6e9e40327435b8ae6daf1d8c2082 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 17:11:08 -0700 Subject: [PATCH 16/23] skipping hf tests till we have a 1b alora --- test/backends/test_huggingface.py | 10 +++++----- test/conftest.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 08213667..6859099a 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -37,7 +37,7 @@ def session(backend): yield session session.reset() - +@pytest.mark.qualitative def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -45,7 +45,7 @@ def test_system_prompt(session): ) print(result) - +@pytest.mark.qualitative def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -63,7 +63,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output - +@pytest.mark.qualitative def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -130,7 +130,7 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] - +@pytest.mark.qualitative def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) @@ -177,7 +177,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) - +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/conftest.py b/test/conftest.py index 906ab5f7..7b4fbff3 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -21,6 +21,6 @@ def pytest_runtest_setup(item): gh_run = int(os.environ.get("GITHUB_ACTION", 0)) if gh_run == 1: - pytest.xfail( + pytest.skip( reason="Skipping qualitative test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." ) From 773b398d2549ca18f3a68a0dc635ef471e49fd70 Mon Sep 17 00:00:00 2001 From: Jake LoRocco Date: Tue, 26 Aug 2025 09:05:46 -0400 Subject: [PATCH 17/23] skip rich doc test that takes too much memory --- test/stdlib_basics/test_richdocument.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/stdlib_basics/test_richdocument.py b/test/stdlib_basics/test_richdocument.py index 8809b1a0..6046ea96 100644 --- a/test/stdlib_basics/test_richdocument.py +++ b/test/stdlib_basics/test_richdocument.py @@ -93,6 +93,7 @@ def test_empty_table(): assert table is None, "table should be empty when supplied string is empty" +@pytest.mark.skip # Test requires too much memory for smaller machines. 
def test_richdocument_generation(rd: RichDocument): m = mellea.start_session(backend_name="hf") response = m.chat(rd.to_markdown()[:500] + "\nSummarize the provided document.") From b432a2a61f94937b0a52232b544c83e35e8b762b Mon Sep 17 00:00:00 2001 From: Jake LoRocco Date: Tue, 26 Aug 2025 09:07:02 -0400 Subject: [PATCH 18/23] remove unused session functions --- mellea/stdlib/session.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py index 0d2d6f97..0250b768 100644 --- a/mellea/stdlib/session.py +++ b/mellea/stdlib/session.py @@ -414,31 +414,6 @@ def validate( return rvs - def req(self, *args, **kwargs): - """Shorthand for Requirement.__init__(...).""" - return req(*args, **kwargs) - - def check(self, *args, **kwargs): - """Shorthand for Requirement.__init__(..., check_only=True).""" - return check(*args, **kwargs) - - def load_default_aloras(self): - """Loads the default Aloras for this model, if they exist and if the backend supports.""" - from mellea.backends.huggingface import LocalHFBackend - - if self.backend.model_id == IBM_GRANITE_3_2_8B and isinstance( - self.backend, LocalHFBackend - ): - from mellea.backends.aloras.huggingface.granite_aloras import ( - add_granite_aloras, - ) - - add_granite_aloras(self.backend) - return - self._session_logger.warning( - "This model/backend combination does not support any aloras." 
- ) - def genslot( self, gen_slot: Component, From 16e909d3fec052f9d901cc8fbfa6d8b62fcc39e2 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:03:54 -0700 Subject: [PATCH 19/23] changing env var name --- .github/workflows/quality.yml | 2 +- pyproject.toml | 2 +- test/conftest.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 1e3a97ba..d68b457f 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -18,7 +18,7 @@ jobs: matrix: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. env: - GITHUB_ACTION: 1 + CICD: 1 OLLAMA_HOST: "127.0.0.1:5000" steps: - uses: actions/checkout@v4 diff --git a/pyproject.toml b/pyproject.toml index 6614b40a..4208cdf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,5 +164,5 @@ python_version = "3.10" [tool.pytest.ini_options] markers = [ - "qualitative: Marks the test as needing an exact output from an LLM; set by an ENV variable for GITHUB_ACTIONS. All tests marked with this will xfail in CI/CD" + "qualitative: Marks the test as needing an exact output from an LLM; set by an ENV variable for CICD. All tests marked with this will xfail in CI/CD" ] \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index 7b4fbff3..6e5d83c6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="session") def gh_run() -> int: - return int(os.environ.get("GITHUB_ACTION", 0)) # type: ignore + return int(os.environ.get("CICD", 0)) # type: ignore def pytest_runtest_setup(item): @@ -18,9 +18,9 @@ def pytest_runtest_setup(item): if not item.get_closest_marker("qualitative"): return - gh_run = int(os.environ.get("GITHUB_ACTION", 0)) + gh_run = int(os.environ.get("CICD", 0)) if gh_run == 1: pytest.skip( - reason="Skipping qualitative test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." 
+ reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." ) From 5c0c3eea6bd40f40755cda57e9aea6b6b29a3272 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:20:33 -0700 Subject: [PATCH 20/23] minor changes --- test/backends/test_watsonx.py | 26 ++++++++++++++++---------- test/stdlib_basics/test_session.py | 2 ++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index 255f2164..85dedd66 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -15,36 +15,42 @@ @pytest.fixture(scope="module") def backend(): """Shared Watson backend for all tests in this module.""" - return WatsonxAIBackend( + if os.environ.get("CICD") == 1: + pytest.skip("Skipping watsonx tests.") + else: + return WatsonxAIBackend( model_id="ibm/granite-3-3-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-3.3-8b-instruct"), ) @pytest.fixture(scope="function") -def session(backend): - """Fresh Watson session for each test.""" - session = MelleaSession(backend, ctx=LinearContext(is_chat_context=True)) - yield session - session.reset() +def session(backend: WatsonxAIBackend): + if os.environ.get("CICD") == 1: + pytest.skip("Skipping watsonx tests.") + else: + """Fresh Watson session for each test.""" + session = MelleaSession(backend, ctx=LinearContext(is_chat_context=True)) + yield session + session.reset() @pytest.mark.qualitative -def test_instruct(session): +def test_instruct(session: MelleaSession): result = session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore @pytest.mark.qualitative -def test_multiturn(session): +def test_multiturn(session: MelleaSession): session.instruct("What is the capital of France?") answer = session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore @pytest.mark.qualitative 
-def test_format(session): +def test_format(session: MelleaSession): class Person(pydantic.BaseModel): name: str # it does not support regex patterns in json schema @@ -78,7 +84,7 @@ class Email(pydantic.BaseModel): @pytest.mark.qualitative -def test_generate_from_raw(session): +def test_generate_from_raw(session: MelleaSession): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] results = session.backend._generate_from_raw( diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 6ce61c14..8b9ff7d1 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -1,3 +1,5 @@ +import os + import pytest from mellea.stdlib.base import ModelOutputThunk From 4fc84beecb2d81c869b010f6e26288ce59c9ee8d Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:40:00 -0700 Subject: [PATCH 21/23] ignoring more watsonx for now --- test/conftest.py | 2 +- test/stdlib_basics/test_session.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 6e5d83c6..e95ce41b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -21,6 +21,6 @@ def pytest_runtest_setup(item): gh_run = int(os.environ.get("CICD", 0)) if gh_run == 1: - pytest.skip( + pytest.xfail( reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." 
) diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 8b9ff7d1..2d91df97 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -7,10 +7,13 @@ def test_start_session_watsonx(): - m = start_session(backend_name="watsonx") - response = m.instruct("testing") - assert isinstance(response, ModelOutputThunk) - assert response.value is not None + if os.environ.get("CICD") == 1: + pytest.skip("Skipping watsonx tests.") + else: + m = start_session(backend_name="watsonx") + response = m.instruct("testing") + assert isinstance(response, ModelOutputThunk) + assert response.value is not None def test_start_session_openai_with_kwargs(gh_run): From 6b86010600fa9dc85debab0f1ea13fa8e0bf3c36 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:50:24 -0700 Subject: [PATCH 22/23] minor changes --- test/stdlib_basics/test_session.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 2d91df97..9caa8d6f 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -6,8 +6,8 @@ from mellea.stdlib.session import start_session -def test_start_session_watsonx(): - if os.environ.get("CICD") == 1: +def test_start_session_watsonx(gh_run): + if gh_run == 1: pytest.skip("Skipping watsonx tests.") else: m = start_session(backend_name="watsonx") From e056768decd40b77cf977fcae76b71171f1f8562 Mon Sep 17 00:00:00 2001 From: Jake LoRocco Date: Tue, 26 Aug 2025 12:38:57 -0400 Subject: [PATCH 23/23] fix non-duplicate member func for mify in python 3.11 --- mellea/stdlib/mify.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mellea/stdlib/mify.py b/mellea/stdlib/mify.py index 639bbc80..5b278f2d 100644 --- a/mellea/stdlib/mify.py +++ b/mellea/stdlib/mify.py @@ -132,8 +132,8 @@ def _get_all_fields(self) -> dict[str, Any]: if self._fields_exclude: 
fields_exclude = self._fields_exclude - # This includes fields defined by any superclasses, as long as it's not object. - all_fields = _get_non_duplicate_fields(self, object) + # This includes fields defined by any superclasses, as long as it's not Protocol. + all_fields = _get_non_duplicate_fields(self, Protocol) # It does matter if include is an empty set. Handle it's cases here. if self._fields_include is not None: @@ -366,18 +366,15 @@ def mification(obj: T) -> T: def _get_non_duplicate_members( - object: object, check_duplicates: object + obj: object, check_duplicates: object ) -> dict[str, Callable]: """Returns all methods/functions unique to the object.""" members = dict( inspect.getmembers( - object, + obj, # Checks for ismethod or isfunction because of the methods added from the MifiedProtocol. - predicate=lambda x: inspect.ismethod(x) - or ( - inspect.isfunction(x) - and x.__name__ not in dict(inspect.getmembers(check_duplicates)).keys() - ), + predicate=lambda x: (inspect.ismethod(x) or inspect.isfunction(x)) + and x.__name__ not in dict(inspect.getmembers(check_duplicates)).keys(), ) ) return members