From 9498f0fac93af200f1dbedd79a15481721276013 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Fri, 22 Aug 2025 10:06:51 -0700 Subject: [PATCH 01/23] adding conftest.py for test configs --- test/conftest.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 test/conftest.py diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 00000000..a4ac821f --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,48 @@ +import os + +import pytest + +from mellea.backends.huggingface import LocalHFBackend +from mellea.backends.ollama import OllamaModelBackend +from mellea.backends.openai import OpenAIBackend +from mellea.stdlib.session import MelleaSession + + +@pytest.fixture(scope="session") +def gh_run() -> int: + return int(os.environ.get("GITHUB_ACTION", 0)) # type: ignore + + +def pytest_runtest_setup(item): + # Runs tests *not* marked with `@pytest.mark.llm` to run normally. + if not item.get_closest_marker("llm"): + return + + gh_run = int(os.environ.get("GITHUB_ACTION", 0)) + + if gh_run == 1: + pytest.skip( + reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." + ) + + # # Check if there is a session fixture. + # try: + # session: MelleaSession = item._request.getfixturevalue("m_session") + # except Exception: + # # Skip test cause all llm marked tests need a session fixture. + # pytest.skip("`llm` marked tests requires a `m_session` fixture.") + # # Get the Ollama name. + # if isinstance(session.backend, OllamaModelBackend) or isinstance(session.backend, OpenAIBackend): + # model_id = session.backend.model_id.ollama_name + # # Skip tests of the model name is llama 1b + # if model_id == "llama3.2:1b": + # pytest.skip( + # "Skipping LLM test: got model_id == llama3.2:1b in ollama. Used only in gh workflows." 
+ # ) + # elif isinstance(session.backend, LocalHFBackend): + # model_id = session.backend.model_id.hf_model_name + # # Skip tests of the model name is llama 1b + # if model_id == "unsloth/Llama-3.2-1B": + # pytest.skip( + # "Skipping LLM test: got model_id == unsloth/Llama-3.2-1B in hf. Used only in gh workflows." + # ) From 68cf8c6811398e6f6ea024d4bbeeca4ec671f7fd Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:30:38 -0700 Subject: [PATCH 02/23] adding modified tests with optional LLM runs --- test/backends/test_huggingface.py | 23 ++--- test/backends/test_ollama.py | 31 ++++--- test/backends/test_openai_ollama.py | 48 +++++++---- test/backends/test_watsonx.py | 22 +++-- test/conftest.py | 2 +- test/stdlib_basics/test_base.py | 3 +- test/stdlib_basics/test_chat_view.py | 5 +- test/stdlib_basics/test_contextual_session.py | 85 +++++++++++-------- test/stdlib_basics/test_genslot.py | 2 +- test/stdlib_basics/test_session.py | 15 +++- 10 files changed, 142 insertions(+), 94 deletions(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 625f22d3..f91711c2 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -21,6 +21,7 @@ @pytest.fixture(scope="module") def backend(): """Shared HuggingFace backend for all tests in this module.""" + # TODO: find a smalle 1B model to do Alora stuff on github actions. backend = LocalHFBackend( model_id="ibm-granite/granite-3.2-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"), @@ -37,7 +38,7 @@ def session(backend): yield session session.reset() - +@pytest.mark.llm def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -45,7 +46,7 @@ def test_system_prompt(session): ) print(result) - +@pytest.mark.llm def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -63,7 +64,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output - +@pytest.mark.llm def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -79,7 +80,7 @@ def test_constraint_lora_with_requirement(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) in ["Y", "N"] - +@pytest.mark.llm def test_constraint_lora_override(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -94,7 +95,7 @@ def test_constraint_lora_override(session, backend): assert isinstance(default_output_to_bool(str(val_result.reason)), bool) backend.default_to_constraint_checking_alora = True - +@pytest.mark.llm def test_constraint_lora_override_does_not_override_alora(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -111,7 +112,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend): assert str(val_result.reason) in ["Y", "N"] backend.default_to_constraint_checking_alora = True - +@pytest.mark.llm def test_llmaj_req_does_not_use_alora(session, backend): backend.default_to_constraint_checking_alora = True # type: ignore answer = session.instruct( @@ -127,12 +128,12 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] - +@pytest.mark.llm def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) - +@pytest.mark.llm def test_multiturn(session): session.instruct("Compute 1+1") beta = session.instruct( @@ -142,7 +143,7 @@ def test_multiturn(session): words = session.instruct("Now list five English words that start with that letter.") print(words) - 
+@pytest.mark.llm def test_format(session): class Person(pydantic.BaseModel): name: str @@ -172,7 +173,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) - +@pytest.mark.llm def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -182,7 +183,7 @@ def test_generate_from_raw(session): assert len(results) == len(prompts) - +@pytest.mark.llm def test_generate_from_raw_with_format(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index 78b07de4..fdba285a 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -1,11 +1,13 @@ -from mellea import start_session, SimpleContext -from mellea.stdlib.base import CBlock -from mellea.stdlib.requirement import Requirement -import pydantic import json + +import pydantic +import pytest from typing_extensions import Annotated + +from mellea import SimpleContext, start_session from mellea.backends.types import ModelOption -import pytest +from mellea.stdlib.base import CBlock +from mellea.stdlib.requirement import Requirement @pytest.fixture(scope="function") @@ -15,6 +17,8 @@ def session(): yield session session.reset() + +@pytest.mark.llm def test_simple_instruct(session): result = session.instruct( "Write an email to Hendrik trying to sell him self-sealing stembolts." @@ -23,6 +27,8 @@ def test_simple_instruct(session): assert "chat_response" in result._meta assert result._meta["chat_response"].message.role == "assistant" + +@pytest.mark.llm def test_instruct_with_requirement(session): response = session.instruct( "Write an email to Hendrik convincing him to buy some self-sealing stembolts." 
@@ -45,12 +51,14 @@ def test_instruct_with_requirement(session): ) print(results) +@pytest.mark.llm def test_chat(session): output_message = session.chat("What is 1+1?") - assert ( - "2" in output_message.content - ), f"Expected a message with content containing 2 but found {output_message}" + assert "2" in output_message.content, ( + f"Expected a message with content containing 2 but found {output_message}" + ) +@pytest.mark.llm def test_format(session): class Person(pydantic.BaseModel): name: str @@ -83,6 +91,7 @@ class Email(pydantic.BaseModel): # assert email.to.email_address.endswith("example.com") pass +@pytest.mark.llm def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -113,9 +122,9 @@ class Answer(pydantic.BaseModel): try: answer = Answer.model_validate_json(random_result.value) except pydantic.ValidationError as e: - assert ( - False - ), f"formatting directive failed for {random_result.value}: {e.json()}" + assert False, ( + f"formatting directive failed for {random_result.value}: {e.json()}" + ) if __name__ == "__main__": diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 5bf3b5f3..200b539d 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -1,41 +1,52 @@ # test/rits_backend_tests/test_openai_integration.py +import pydantic +import pytest +from typing_extensions import Annotated + from mellea import MelleaSession -from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk -from mellea.backends.openai import OpenAIBackend from mellea.backends.formatter import TemplateFormatter +from mellea.backends.model_ids import META_LLAMA_3_2_1B +from mellea.backends.openai import OpenAIBackend from mellea.backends.types import ModelOption - -import pydantic -from typing_extensions import Annotated -import pytest +from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk 
@pytest.fixture(scope="module") -def backend(): +def backend(gh_run: int): """Shared OpenAI backend configured for Ollama.""" - return OpenAIBackend( - model_id="granite3.3:8b", - formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), + if gh_run == 1: + return OpenAIBackend( + model_id=META_LLAMA_3_2_1B, + formatter=TemplateFormatter(model_id=META_LLAMA_3_2_1B), base_url="http://localhost:11434/v1", api_key="ollama", ) + else: + return OpenAIBackend( + model_id="granite3.3:8b", + formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), + base_url="http://localhost:11434/v1", + api_key="ollama", + ) @pytest.fixture(scope="function") -def session(backend): +def m_session(backend): """Fresh OpenAI session for each test.""" session = MelleaSession(backend, ctx=LinearContext(is_chat_context=True)) yield session session.reset() -def test_instruct(session): - result = session.instruct("Compute 1+1.") +@pytest.mark.llm +def test_instruct(m_session): + result = m_session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore -def test_multiturn(session): - session.instruct("What is the capital of France?") - answer = session.instruct("Tell me the answer to the previous question.") +@pytest.mark.llm +def test_multiturn(m_session): + m_session.instruct("What is the capital of France?") + answer = m_session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore # def test_api_timeout_error(self): @@ -53,7 +64,8 @@ def test_multiturn(session): # assert "granite3.3:8b" in result.value # self.m.reset() -def test_format(session): +@pytest.mark.llm +def test_format(m_session): class Person(pydantic.BaseModel): name: str # it does not support regex patterns in json schema @@ -68,7 +80,7 @@ class Email(pydantic.BaseModel): subject: str body: str - output = session.instruct( + output = m_session.instruct( "Write a short email to Olivia, 
thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ", format=Email, model_options={ModelOption.MAX_NEW_TOKENS: 2**8}, diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index b62def03..af8634b6 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -1,14 +1,15 @@ # test/rits_backend_tests/test_watsonx_integration.py import os -from mellea import MelleaSession -from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk -from mellea.backends.watsonx import WatsonxAIBackend -from mellea.backends.formatter import TemplateFormatter -from mellea.backends.types import ModelOption import pydantic -from typing_extensions import Annotated import pytest +from typing_extensions import Annotated + +from mellea import MelleaSession +from mellea.backends.formatter import TemplateFormatter +from mellea.backends.types import ModelOption +from mellea.backends.watsonx import WatsonxAIBackend +from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk @pytest.fixture(scope="module") @@ -28,18 +29,21 @@ def session(backend): session.reset() - - +@pytest.mark.llm def test_instruct(session): result = session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore + +@pytest.mark.llm def test_multiturn(session): session.instruct("What is the capital of France?") answer = session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore + +@pytest.mark.llm def test_format(session): class Person(pydantic.BaseModel): name: str @@ -72,6 +76,8 @@ class Email(pydantic.BaseModel): # assert email.to.email_address.endswith("example.com") pass + +@pytest.mark.llm def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/conftest.py b/test/conftest.py index a4ac821f..e81414f2 100644 --- 
a/test/conftest.py +++ b/test/conftest.py @@ -21,7 +21,7 @@ def pytest_runtest_setup(item): gh_run = int(os.environ.get("GITHUB_ACTION", 0)) if gh_run == 1: - pytest.skip( + pytest.xfail( reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." ) diff --git a/test/stdlib_basics/test_base.py b/test/stdlib_basics/test_base.py index 9ec99db6..6d4008bd 100644 --- a/test/stdlib_basics/test_base.py +++ b/test/stdlib_basics/test_base.py @@ -1,5 +1,4 @@ -from mellea.stdlib.base import Component, CBlock -from mellea.stdlib.base import LinearContext +from mellea.stdlib.base import CBlock, Component, LinearContext def test_cblock(): diff --git a/test/stdlib_basics/test_chat_view.py b/test/stdlib_basics/test_chat_view.py index c0d9e9eb..c56b7c2e 100644 --- a/test/stdlib_basics/test_chat_view.py +++ b/test/stdlib_basics/test_chat_view.py @@ -1,7 +1,8 @@ import pytest -from mellea.stdlib.base import ModelOutputThunk, LinearContext -from mellea.stdlib.chat import as_chat_history, Message + +from mellea.stdlib.base import LinearContext, ModelOutputThunk +from mellea.stdlib.chat import Message, as_chat_history from mellea.stdlib.session import start_session diff --git a/test/stdlib_basics/test_contextual_session.py b/test/stdlib_basics/test_contextual_session.py index 97699831..4bff317f 100644 --- a/test/stdlib_basics/test_contextual_session.py +++ b/test/stdlib_basics/test_contextual_session.py @@ -1,10 +1,21 @@ -import pytest from typing import Literal -from mellea import generative, start_session, instruct, chat, validate, query, transform + +import pytest + +from mellea import chat, generative, instruct, query, start_session, transform, validate +from mellea.backends.model_ids import IBM_GRANITE_3_3_8B, META_LLAMA_3_2_1B from mellea.stdlib.base import ModelOutputThunk -from mellea.stdlib.session import get_session, MelleaSession -from mellea.stdlib.mify import mify, MifiedProtocol +from mellea.stdlib.mify import MifiedProtocol, mify from 
mellea.stdlib.requirement import req +from mellea.stdlib.session import MelleaSession, get_session + + +@pytest.fixture(scope="module") +def model_id(gh_run: int): + if gh_run == 1: + return META_LLAMA_3_2_1B + else: + return IBM_GRANITE_3_3_8B @generative @@ -26,9 +37,9 @@ def get_info(self) -> str: return f"{self.name} is {self.age} years old" -def test_basic_contextual_session(): +def test_basic_contextual_session(model_id): """Test basic contextual session usage with convenience functions.""" - with start_session(): + with start_session(model_id=model_id): # Test instruct result = instruct("Say hello") assert isinstance(result, ModelOutputThunk) @@ -51,9 +62,9 @@ def test_no_active_session_error(): chat("test") -def test_generative_with_contextual_session(): +def test_generative_with_contextual_session(model_id): """Test generative slots work with contextual sessions.""" - with start_session(): + with start_session(model_id=model_id): # Test without explicit session parameter result = classify_sentiment(text="I love this!") assert result in ["positive", "negative"] @@ -63,18 +74,18 @@ def test_generative_with_contextual_session(): assert isinstance(summary, str) assert len(summary) > 0 - -def test_generative_backward_compatibility(): +@pytest.mark.llm +def test_generative_backward_compatibility(model_id): """Test that generative slots still work with explicit session parameter.""" - with start_session() as m: + with start_session(model_id=model_id) as m: # Test old pattern still works result = classify_sentiment(m, text="I love this!") assert result in ["positive", "negative"] -def test_mify_with_contextual_session(): +def test_mify_with_contextual_session(model_id): """Test mify functionality with contextual sessions.""" - with start_session(): + with start_session(model_id=model_id): person = TestPerson("Alice", 30) assert isinstance(person, MifiedProtocol) @@ -88,13 +99,13 @@ def test_mify_with_contextual_session(): assert transform_result is not None -def 
test_nested_sessions(): +def test_nested_sessions(model_id): """Test nested sessions behavior.""" - with start_session() as outer_session: + with start_session(model_id=model_id) as outer_session: outer_result = instruct("outer session test") assert isinstance(outer_result, ModelOutputThunk) - with start_session() as inner_session: + with start_session(model_id=model_id) as inner_session: # Inner session should be active current_session = get_session() assert current_session is inner_session @@ -107,10 +118,10 @@ def test_nested_sessions(): assert current_session is outer_session -def test_session_cleanup(): +def test_session_cleanup(model_id): """Test session cleanup after context exit.""" session_ref = None - with start_session() as m: + with start_session(model_id=model_id) as m: session_ref = m instruct("test during session") @@ -119,19 +130,19 @@ def test_session_cleanup(): get_session() # Session should have been cleaned up - assert hasattr(session_ref, 'ctx') + assert hasattr(session_ref, "ctx") -def test_all_convenience_functions(): +def test_all_convenience_functions(model_id): """Test all convenience functions work within contextual session.""" - with start_session(): + with start_session(model_id=model_id): # Test instruct instruct_result = instruct("Generate a greeting") assert isinstance(instruct_result, ModelOutputThunk) # Test chat chat_result = chat("Hello there") - assert hasattr(chat_result, 'content') + assert hasattr(chat_result, "content") # Test validate validation = validate([req("The response should be positive")]) @@ -147,18 +158,18 @@ def test_all_convenience_functions(): assert transform_result is not None -def test_session_with_parameters(): +def test_session_with_parameters(model_id): """Test contextual session with custom parameters.""" - with start_session(backend_name="ollama", model_id="granite3.3:8b") as m: + with start_session(backend_name="ollama", model_id=META_LLAMA_3_2_1B) as m: result = instruct("test with parameters") assert 
isinstance(result, ModelOutputThunk) assert isinstance(m, MelleaSession) -def test_multiple_sequential_sessions(): +def test_multiple_sequential_sessions(model_id): """Test multiple sequential contextual sessions.""" # First session - with start_session(): + with start_session(model_id=model_id): result1 = instruct("first session") assert isinstance(result1, ModelOutputThunk) @@ -167,14 +178,14 @@ def test_multiple_sequential_sessions(): get_session() # Second session - with start_session(): + with start_session(model_id=model_id): result2 = instruct("second session") assert isinstance(result2, ModelOutputThunk) -def test_contextual_session_with_mified_object_methods(): +def test_contextual_session_with_mified_object_methods(model_id): """Test that mified objects work properly within contextual sessions.""" - with start_session(): + with start_session(model_id=model_id): person = TestPerson("Bob", 25) # Test that mified object methods work @@ -187,12 +198,12 @@ def test_contextual_session_with_mified_object_methods(): # Test format_for_llm llm_format = person.format_for_llm() assert llm_format is not None - assert hasattr(llm_format, 'args') + assert hasattr(llm_format, "args") -def test_session_methods_with_mified_objects(): +def test_session_methods_with_mified_objects(model_id): """Test using session query/transform methods with mified objects.""" - with start_session() as m: + with start_session(model_id=model_id) as m: person = TestPerson("Charlie", 35) # Test session query method @@ -205,11 +216,11 @@ def test_session_methods_with_mified_objects(): assert transform_result is not None # Verify mified objects have query/transform object creation methods - assert hasattr(person, 'get_query_object') - assert hasattr(person, 'get_transform_object') - assert hasattr(person, '_query_type') - assert hasattr(person, '_transform_type') + assert hasattr(person, "get_query_object") + assert hasattr(person, "get_transform_object") + assert hasattr(person, "_query_type") + 
assert hasattr(person, "_transform_type") if __name__ == "__main__": - pytest.main([__file__]) \ No newline at end of file + pytest.main([__file__]) diff --git a/test/stdlib_basics/test_genslot.py b/test/stdlib_basics/test_genslot.py index 8aa51828..58cc974a 100644 --- a/test/stdlib_basics/test_genslot.py +++ b/test/stdlib_basics/test_genslot.py @@ -33,7 +33,7 @@ def test_func(session): write_email_component = write_me_an_email(session) assert isinstance(write_email_component, str) - +@pytest.mark.llm def test_sentiment_output(classify_sentiment_output): assert classify_sentiment_output in ["positive", "negative"] diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 351c08d8..7c592ac4 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -1,4 +1,5 @@ import pytest + from mellea.stdlib.base import ModelOutputThunk from mellea.stdlib.session import start_session @@ -10,13 +11,21 @@ def test_start_session_watsonx(): assert response.value is not None -def test_start_session_openai_with_kwargs(): - m = start_session( +def test_start_session_openai_with_kwargs(gh_run): + if gh_run == 1: + m = start_session( "openai", - model_id="granite3.3:8b", + model_id="llama3.2:1b", base_url="http://localhost:11434/v1", api_key="ollama", ) + else: + m = start_session( + "openai", + model_id="granite3.3:8b", + base_url="http://localhost:11434/v1", + api_key="ollama", + ) response = m.instruct("testing") assert isinstance(response, ModelOutputThunk) assert response.value is not None From b3189866fbb0ca79a4aebd12460ea1e9b4ccd80c Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:30:59 -0700 Subject: [PATCH 03/23] adding llama 1b in model ids --- mellea/backends/model_ids.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mellea/backends/model_ids.py b/mellea/backends/model_ids.py index 40db617f..a750998c 100644 --- a/mellea/backends/model_ids.py +++ b/mellea/backends/model_ids.py 
@@ -89,6 +89,10 @@ class ModelIdentifier: ollama_name="llama-guard3:1b", hf_model_name="unsloth/Llama-Guard-3-1B" ) +META_LLAMA_3_2_1B = ModelIdentifier( + ollama_name="llama3.2:1b", hf_model_name="unsloth/Llama-3.2-1B" +) + ######################## #### Mistral models #### ######################## From 733b53c6da76f6dfe7e090b108e84b298693c11a Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:31:40 -0700 Subject: [PATCH 04/23] adding tests to config and workflow --- .github/workflows/run tests.yaml | 41 ++++++++++++++++++++++++++++++++ pyproject.toml | 5 ++++ 2 files changed, 46 insertions(+) create mode 100644 .github/workflows/run tests.yaml diff --git a/.github/workflows/run tests.yaml b/.github/workflows/run tests.yaml new file mode 100644 index 00000000..3a7b28ac --- /dev/null +++ b/.github/workflows/run tests.yaml @@ -0,0 +1,41 @@ +name: Test non-llm components + +on: + workflow_run: + workflows: ["Verify Code Quality"] + types: + - completed + +concurrency: + group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }} + cancel-in-progress: true + +jobs: + tests: + if: ${{ github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + timeout-minutes: 30 + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
+ env: + GITHUB_ACTIONS: 1 + + steps: + - uses: actions/checkout@v4 + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v5 + with: + python-version: ${{ matrix.python-version }} + enable-cache: true + - name: Install dependencies + run: uv sync --frozen --all-extras --group dev + - name: Install Ollama + run: curl -fsSL https://ollama.com/install.sh | sh + - name: Pull Llama 3.2:1b model + run: ollama pull llama3.2:1b + - name: Start serving the model + run: nohup ollama serve & + + - name: Run Tests + run: uv run -m pytest -v test diff --git a/pyproject.toml b/pyproject.toml index 1548e1d9..3eebf620 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -161,3 +161,8 @@ skip = 'requirements.txt,uv.lock' [tool.mypy] disable_error_code = ["empty-body", "import-untyped"] python_version = "3.10" + +[tool.pytest.ini_options] +markers = [ + "llm: Marks the test as needing an exact output from an LLM (deselect with '-m \" not llm\"'); this depends on the session.backend.model_id" +] \ No newline at end of file From 98661573cb84ba101d4e98e9fc72326235670a24 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 09:43:07 -0700 Subject: [PATCH 05/23] renaming workflow --- .github/workflows/{run tests.yaml => run_llm_tests.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{run tests.yaml => run_llm_tests.yaml} (100%) diff --git a/.github/workflows/run tests.yaml b/.github/workflows/run_llm_tests.yaml similarity index 100% rename from .github/workflows/run tests.yaml rename to .github/workflows/run_llm_tests.yaml From 1eccc3327c1bef83f9db6135be706358b2b72025 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 10:03:49 -0700 Subject: [PATCH 06/23] small changes --- .github/workflows/run_llm_tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/run_llm_tests.yaml b/.github/workflows/run_llm_tests.yaml index 3a7b28ac..55f80fb9 100644 --- 
a/.github/workflows/run_llm_tests.yaml +++ b/.github/workflows/run_llm_tests.yaml @@ -2,9 +2,9 @@ name: Test non-llm components on: workflow_run: - workflows: ["Verify Code Quality"] - types: - - completed + workflows: [Verify Code Quality] + types: [completed] + concurrency: group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }} From 0ae217d76f36865fba4d3b1c8b0727479a5a27e6 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 11:21:41 -0700 Subject: [PATCH 07/23] trying to test workflow --- .github/workflows/run_llm_tests.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/run_llm_tests.yaml b/.github/workflows/run_llm_tests.yaml index 55f80fb9..33bc0785 100644 --- a/.github/workflows/run_llm_tests.yaml +++ b/.github/workflows/run_llm_tests.yaml @@ -1,6 +1,7 @@ name: Test non-llm components on: + workflow_dispatch: {} workflow_run: workflows: [Verify Code Quality] types: [completed] From aa7760afe86833246e5e06f1f6fc9e1cd8e3a13a Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 11:56:45 -0700 Subject: [PATCH 08/23] adding the tests to the quality workflow --- .github/workflows/quality.yml | 16 +++++++++-- .github/workflows/run_llm_tests.yaml | 42 ---------------------------- 2 files changed, 14 insertions(+), 44 deletions(-) delete mode 100644 .github/workflows/run_llm_tests.yaml diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 7634b8be..d2bdb5d3 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -31,9 +31,21 @@ jobs: path: ~/.cache/pre-commit key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} - name: Install dependencies - run: uv sync --frozen --all-extras + run: uv sync --frozen --all-extras --group dev - name: Check style and run tests run: pre-commit run --all-files - - name: Send failure message + - name: Send failure message pre-commit 
if: failure() # This step will only run if a previous step failed run: echo "The quality verification failed. Please run precommit " + - name: Install Ollama + run: curl -fsSL https://ollama.com/install.sh | sh + - name: Pull Llama 3.2:1b model + run: ollama pull llama3.2:1b + - name: Start serving the model + run: nohup ollama serve & + - name: Run Tests + run: uv run -m pytest -v test + - name: Send failure message tests + if: failure() # This step will only run if a previous step failed + run: echo "Tests failed. Please verify that tests are working locally." + diff --git a/.github/workflows/run_llm_tests.yaml b/.github/workflows/run_llm_tests.yaml deleted file mode 100644 index 33bc0785..00000000 --- a/.github/workflows/run_llm_tests.yaml +++ /dev/null @@ -1,42 +0,0 @@ -name: Test non-llm components - -on: - workflow_dispatch: {} - workflow_run: - workflows: [Verify Code Quality] - types: [completed] - - -concurrency: - group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.event.pull_request.number || github.ref_name }} - cancel-in-progress: true - -jobs: - tests: - if: ${{ github.event.workflow_run.conclusion == 'success' }} - runs-on: ubuntu-latest - timeout-minutes: 30 - strategy: - matrix: - python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
- env: - GITHUB_ACTIONS: 1 - - steps: - - uses: actions/checkout@v4 - - name: Install uv and set the python version - uses: astral-sh/setup-uv@v5 - with: - python-version: ${{ matrix.python-version }} - enable-cache: true - - name: Install dependencies - run: uv sync --frozen --all-extras --group dev - - name: Install Ollama - run: curl -fsSL https://ollama.com/install.sh | sh - - name: Pull Llama 3.2:1b model - run: ollama pull llama3.2:1b - - name: Start serving the model - run: nohup ollama serve & - - - name: Run Tests - run: uv run -m pytest -v test From 89b4834ac910494209aea787e18c0dddf91f26d0 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 12:01:51 -0700 Subject: [PATCH 09/23] adding env variable to disable tests --- .github/workflows/quality.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index d2bdb5d3..42517e22 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -17,6 +17,8 @@ jobs: strategy: matrix: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
+ env: + GITHUB_ACTIONS: 1 steps: - uses: actions/checkout@v4 - name: Install uv and set the python version From 8e6191056976cbbb6a197012d2ac2deba14bd870 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 15:57:03 -0700 Subject: [PATCH 10/23] changing marker name llm -> qualitative --- pyproject.toml | 2 +- test/conftest.py | 26 ++------------------------ 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3eebf620..6614b40a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,5 +164,5 @@ python_version = "3.10" [tool.pytest.ini_options] markers = [ - "llm: Marks the test as needing an exact output from an LLM (deselect with '-m \" not llm\"'); this depends on the session.backend.model_id" + "qualitative: Marks the test as needing an exact output from an LLM; set by an ENV variable for GITHUB_ACTIONS. All tests marked with this will xfail in CI/CD" ] \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index e81414f2..7ded3d8b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -15,34 +15,12 @@ def gh_run() -> int: def pytest_runtest_setup(item): # Runs tests *not* marked with `@pytest.mark.llm` to run normally. - if not item.get_closest_marker("llm"): + if not item.get_closest_marker("qualitative"): return gh_run = int(os.environ.get("GITHUB_ACTION", 0)) if gh_run == 1: pytest.xfail( - reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." + reason="Skipping qualitative test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." ) - - # # Check if there is a session fixture. - # try: - # session: MelleaSession = item._request.getfixturevalue("m_session") - # except Exception: - # # Skip test cause all llm marked tests need a session fixture. - # pytest.skip("`llm` marked tests requires a `m_session` fixture.") - # # Get the Ollama name. 
- # if isinstance(session.backend, OllamaModelBackend) or isinstance(session.backend, OpenAIBackend): - # model_id = session.backend.model_id.ollama_name - # # Skip tests of the model name is llama 1b - # if model_id == "llama3.2:1b": - # pytest.skip( - # "Skipping LLM test: got model_id == llama3.2:1b in ollama. Used only in gh workflows." - # ) - # elif isinstance(session.backend, LocalHFBackend): - # model_id = session.backend.model_id.hf_model_name - # # Skip tests of the model name is llama 1b - # if model_id == "unsloth/Llama-3.2-1B": - # pytest.skip( - # "Skipping LLM test: got model_id == unsloth/Llama-3.2-1B in hf. Used only in gh workflows." - # ) From cb754f2dcfed8392712a2a61c21dbd1b7ea8c8e9 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 15:58:59 -0700 Subject: [PATCH 11/23] changing test markers --- .github/workflows/quality.yml | 2 +- test/backends/test_huggingface.py | 23 +++++++++---------- test/backends/test_ollama.py | 10 ++++---- test/backends/test_openai_ollama.py | 6 ++--- test/backends/test_watsonx.py | 8 +++---- test/conftest.py | 2 +- test/stdlib_basics/test_contextual_session.py | 2 +- test/stdlib_basics/test_genslot.py | 2 +- 8 files changed, 27 insertions(+), 28 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 42517e22..6c6c3da8 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -18,7 +18,7 @@ jobs: matrix: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
env: - GITHUB_ACTIONS: 1 + GITHUB_ACTION: 1 steps: - uses: actions/checkout@v4 - name: Install uv and set the python version diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index f91711c2..68c907d9 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -21,7 +21,6 @@ @pytest.fixture(scope="module") def backend(): """Shared HuggingFace backend for all tests in this module.""" - # TODO: find a smalle 1B model to do Alora stuff on github actions. backend = LocalHFBackend( model_id="ibm-granite/granite-3.2-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"), @@ -38,7 +37,7 @@ def session(backend): yield session session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -46,7 +45,7 @@ def test_system_prompt(session): ) print(result) -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -64,7 +63,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -80,7 +79,7 @@ def test_constraint_lora_with_requirement(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) in ["Y", "N"] -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_lora_override(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -95,7 +94,7 @@ def test_constraint_lora_override(session, backend): assert isinstance(default_output_to_bool(str(val_result.reason)), bool) backend.default_to_constraint_checking_alora = True -@pytest.mark.llm +@pytest.mark.qualitative def test_constraint_lora_override_does_not_override_alora(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore answer = session.instruct( @@ -112,7 +111,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend): assert str(val_result.reason) in ["Y", "N"] backend.default_to_constraint_checking_alora = True -@pytest.mark.llm +@pytest.mark.qualitative def test_llmaj_req_does_not_use_alora(session, backend): backend.default_to_constraint_checking_alora = True # type: ignore answer = session.instruct( @@ -128,12 +127,12 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) -@pytest.mark.llm +@pytest.mark.qualitative def test_multiturn(session): session.instruct("Compute 1+1") beta = session.instruct( @@ 
-143,7 +142,7 @@ def test_multiturn(session): words = session.instruct("Now list five English words that start with that letter.") print(words) -@pytest.mark.llm +@pytest.mark.qualitative def test_format(session): class Person(pydantic.BaseModel): name: str @@ -173,7 +172,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -183,7 +182,7 @@ def test_generate_from_raw(session): assert len(results) == len(prompts) -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw_with_format(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/backends/test_ollama.py b/test/backends/test_ollama.py index fdba285a..b90d93fb 100644 --- a/test/backends/test_ollama.py +++ b/test/backends/test_ollama.py @@ -18,7 +18,7 @@ def session(): session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_simple_instruct(session): result = session.instruct( "Write an email to Hendrik trying to sell him self-sealing stembolts." @@ -28,7 +28,7 @@ def test_simple_instruct(session): assert result._meta["chat_response"].message.role == "assistant" -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct_with_requirement(session): response = session.instruct( "Write an email to Hendrik convincing him to buy some self-sealing stembolts." 
@@ -51,14 +51,14 @@ def test_instruct_with_requirement(session): ) print(results) -@pytest.mark.llm +@pytest.mark.qualitative def test_chat(session): output_message = session.chat("What is 1+1?") assert "2" in output_message.content, ( f"Expected a message with content containing 2 but found {output_message}" ) -@pytest.mark.llm +@pytest.mark.qualitative def test_format(session): class Person(pydantic.BaseModel): name: str @@ -91,7 +91,7 @@ class Email(pydantic.BaseModel): # assert email.to.email_address.endswith("example.com") pass -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index 200b539d..f3c928c3 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -37,13 +37,13 @@ def m_session(backend): yield session session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct(m_session): result = m_session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore -@pytest.mark.llm +@pytest.mark.qualitative def test_multiturn(m_session): m_session.instruct("What is the capital of France?") answer = m_session.instruct("Tell me the answer to the previous question.") @@ -64,7 +64,7 @@ def test_multiturn(m_session): # assert "granite3.3:8b" in result.value # self.m.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_format(m_session): class Person(pydantic.BaseModel): name: str diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index af8634b6..255f2164 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -29,21 +29,21 @@ def session(backend): session.reset() -@pytest.mark.llm +@pytest.mark.qualitative def test_instruct(session): result = session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) 
assert "2" in result.value # type: ignore -@pytest.mark.llm +@pytest.mark.qualitative def test_multiturn(session): session.instruct("What is the capital of France?") answer = session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore -@pytest.mark.llm +@pytest.mark.qualitative def test_format(session): class Person(pydantic.BaseModel): name: str @@ -77,7 +77,7 @@ class Email(pydantic.BaseModel): pass -@pytest.mark.llm +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/conftest.py b/test/conftest.py index 7ded3d8b..906ab5f7 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -14,7 +14,7 @@ def gh_run() -> int: def pytest_runtest_setup(item): - # Runs tests *not* marked with `@pytest.mark.llm` to run normally. + # Runs tests *not* marked with `@pytest.mark.qualitative` to run normally. if not item.get_closest_marker("qualitative"): return diff --git a/test/stdlib_basics/test_contextual_session.py b/test/stdlib_basics/test_contextual_session.py index 4bff317f..1a290149 100644 --- a/test/stdlib_basics/test_contextual_session.py +++ b/test/stdlib_basics/test_contextual_session.py @@ -74,7 +74,7 @@ def test_generative_with_contextual_session(model_id): assert isinstance(summary, str) assert len(summary) > 0 -@pytest.mark.llm +@pytest.mark.qualitative def test_generative_backward_compatibility(model_id): """Test that generative slots still work with explicit session parameter.""" with start_session(model_id=model_id) as m: diff --git a/test/stdlib_basics/test_genslot.py b/test/stdlib_basics/test_genslot.py index 58cc974a..f47b7577 100644 --- a/test/stdlib_basics/test_genslot.py +++ b/test/stdlib_basics/test_genslot.py @@ -33,7 +33,7 @@ def test_func(session): write_email_component = write_me_an_email(session) assert isinstance(write_email_component, str) -@pytest.mark.llm +@pytest.mark.qualitative def 
test_sentiment_output(classify_sentiment_output): assert classify_sentiment_output in ["positive", "negative"] From 26d5dfcef123c75f1cff0b654307f8a6564ee9d4 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 16:15:08 -0700 Subject: [PATCH 12/23] addressing PR comments --- test/backends/test_huggingface.py | 16 +++++++++++----- test/stdlib_basics/test_contextual_session.py | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 68c907d9..08213667 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -37,7 +37,7 @@ def session(backend): yield session session.reset() -@pytest.mark.qualitative + def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -45,7 +45,7 @@ def test_system_prompt(session): ) print(result) -@pytest.mark.qualitative + def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -63,7 +63,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output -@pytest.mark.qualitative + def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -79,6 +79,7 @@ def test_constraint_lora_with_requirement(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) in ["Y", "N"] + @pytest.mark.qualitative def test_constraint_lora_override(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore @@ -94,6 +95,7 @@ def test_constraint_lora_override(session, backend): assert isinstance(default_output_to_bool(str(val_result.reason)), bool) backend.default_to_constraint_checking_alora = True + @pytest.mark.qualitative def test_constraint_lora_override_does_not_override_alora(session, backend): backend.default_to_constraint_checking_alora = False # type: ignore @@ -111,6 +113,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend): assert str(val_result.reason) in ["Y", "N"] backend.default_to_constraint_checking_alora = True + @pytest.mark.qualitative def test_llmaj_req_does_not_use_alora(session, backend): backend.default_to_constraint_checking_alora = True # type: ignore @@ -127,11 +130,12 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] -@pytest.mark.qualitative + def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) + @pytest.mark.qualitative def test_multiturn(session): session.instruct("Compute 1+1") @@ -142,6 +146,7 @@ def test_multiturn(session): words = session.instruct("Now list five English words that start with that letter.") print(words) + @pytest.mark.qualitative def test_format(session): class 
Person(pydantic.BaseModel): @@ -172,7 +177,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) -@pytest.mark.qualitative + def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] @@ -182,6 +187,7 @@ def test_generate_from_raw(session): assert len(results) == len(prompts) + @pytest.mark.qualitative def test_generate_from_raw_with_format(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/stdlib_basics/test_contextual_session.py b/test/stdlib_basics/test_contextual_session.py index 1a290149..a142f879 100644 --- a/test/stdlib_basics/test_contextual_session.py +++ b/test/stdlib_basics/test_contextual_session.py @@ -160,7 +160,7 @@ def test_all_convenience_functions(model_id): def test_session_with_parameters(model_id): """Test contextual session with custom parameters.""" - with start_session(backend_name="ollama", model_id=META_LLAMA_3_2_1B) as m: + with start_session(backend_name="ollama", model_id=model_id) as m: result = instruct("test with parameters") assert isinstance(result, ModelOutputThunk) assert isinstance(m, MelleaSession) From 6102d2e7fed70a42b954c78b00b0b498977f7570 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 16:51:59 -0700 Subject: [PATCH 13/23] changing ollama port --- test/backends/test_openai_ollama.py | 6 ++++-- test/stdlib_basics/test_session.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/test/backends/test_openai_ollama.py b/test/backends/test_openai_ollama.py index f3c928c3..def41004 100644 --- a/test/backends/test_openai_ollama.py +++ b/test/backends/test_openai_ollama.py @@ -1,4 +1,6 @@ # test/rits_backend_tests/test_openai_integration.py +import os + import pydantic import pytest from typing_extensions import Annotated @@ -18,14 +20,14 @@ def backend(gh_run: int): return OpenAIBackend( model_id=META_LLAMA_3_2_1B, 
formatter=TemplateFormatter(model_id=META_LLAMA_3_2_1B), - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) else: return OpenAIBackend( model_id="granite3.3:8b", formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"), - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 7c592ac4..6ce61c14 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -16,14 +16,14 @@ def test_start_session_openai_with_kwargs(gh_run): m = start_session( "openai", model_id="llama3.2:1b", - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) else: m = start_session( "openai", model_id="granite3.3:8b", - base_url="http://localhost:11434/v1", + base_url=f"http://{os.environ.get('OLLAMA_HOST', 'localhost:11434')}/v1", api_key="ollama", ) response = m.instruct("testing") From 752d43d5acc6537428442b26331f09f9eede4f19 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 16:58:43 -0700 Subject: [PATCH 14/23] changing ollama port --- .github/workflows/quality.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 6c6c3da8..7e41015f 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -19,6 +19,7 @@ jobs: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. 
env: GITHUB_ACTION: 1 + OLLAMA_HOST: "127.0.0.1:5000" steps: - uses: actions/checkout@v4 - name: Install uv and set the python version From c0c76a4199a71a8a2efc606ea15ac61b1802872a Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 17:03:17 -0700 Subject: [PATCH 15/23] changing ollama order --- .github/workflows/quality.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 7e41015f..1e3a97ba 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -42,10 +42,11 @@ jobs: run: echo "The quality verification failed. Please run precommit " - name: Install Ollama run: curl -fsSL https://ollama.com/install.sh | sh + - name: Start serving ollama + run: nohup ollama serve & - name: Pull Llama 3.2:1b model run: ollama pull llama3.2:1b - - name: Start serving the model - run: nohup ollama serve & + - name: Run Tests run: uv run -m pytest -v test - name: Send failure message tests From 7f9f8ced73fc6e9e40327435b8ae6daf1d8c2082 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Mon, 25 Aug 2025 17:11:08 -0700 Subject: [PATCH 16/23] skipping hf tests till we have a 1b alora --- test/backends/test_huggingface.py | 10 +++++----- test/conftest.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/backends/test_huggingface.py b/test/backends/test_huggingface.py index 08213667..6859099a 100644 --- a/test/backends/test_huggingface.py +++ b/test/backends/test_huggingface.py @@ -37,7 +37,7 @@ def session(backend): yield session session.reset() - +@pytest.mark.qualitative def test_system_prompt(session): result = session.chat( "Where are we going?", @@ -45,7 +45,7 @@ def test_system_prompt(session): ) print(result) - +@pytest.mark.qualitative def test_constraint_alora(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. 
Be concise and don't write code to answer the question.", @@ -63,7 +63,7 @@ def test_constraint_alora(session, backend): ) assert alora_output in ["Y", "N"], alora_output - +@pytest.mark.qualitative def test_constraint_lora_with_requirement(session, backend): answer = session.instruct( "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa" @@ -130,7 +130,7 @@ def test_llmaj_req_does_not_use_alora(session, backend): assert isinstance(val_result, ValidationResult) assert str(val_result.reason) not in ["Y", "N"] - +@pytest.mark.qualitative def test_instruct(session): result = session.instruct("Compute 1+1.") print(result) @@ -177,7 +177,7 @@ class Email(pydantic.BaseModel): "The email address should be at example.com" ) - +@pytest.mark.qualitative def test_generate_from_raw(session): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] diff --git a/test/conftest.py b/test/conftest.py index 906ab5f7..7b4fbff3 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -21,6 +21,6 @@ def pytest_runtest_setup(item): gh_run = int(os.environ.get("GITHUB_ACTION", 0)) if gh_run == 1: - pytest.xfail( + pytest.skip( reason="Skipping qualitative test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." ) From 773b398d2549ca18f3a68a0dc635ef471e49fd70 Mon Sep 17 00:00:00 2001 From: Jake LoRocco Date: Tue, 26 Aug 2025 09:05:46 -0400 Subject: [PATCH 17/23] skip rich doc test that takes too much memory --- test/stdlib_basics/test_richdocument.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/stdlib_basics/test_richdocument.py b/test/stdlib_basics/test_richdocument.py index 8809b1a0..6046ea96 100644 --- a/test/stdlib_basics/test_richdocument.py +++ b/test/stdlib_basics/test_richdocument.py @@ -93,6 +93,7 @@ def test_empty_table(): assert table is None, "table should be empty when supplied string is empty" +@pytest.mark.skip # Test requires too much memory for smaller machines. 
def test_richdocument_generation(rd: RichDocument): m = mellea.start_session(backend_name="hf") response = m.chat(rd.to_markdown()[:500] + "\nSummarize the provided document.") From b432a2a61f94937b0a52232b544c83e35e8b762b Mon Sep 17 00:00:00 2001 From: Jake LoRocco Date: Tue, 26 Aug 2025 09:07:02 -0400 Subject: [PATCH 18/23] remove unused session functions --- mellea/stdlib/session.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/mellea/stdlib/session.py b/mellea/stdlib/session.py index 0d2d6f97..0250b768 100644 --- a/mellea/stdlib/session.py +++ b/mellea/stdlib/session.py @@ -414,31 +414,6 @@ def validate( return rvs - def req(self, *args, **kwargs): - """Shorthand for Requirement.__init__(...).""" - return req(*args, **kwargs) - - def check(self, *args, **kwargs): - """Shorthand for Requirement.__init__(..., check_only=True).""" - return check(*args, **kwargs) - - def load_default_aloras(self): - """Loads the default Aloras for this model, if they exist and if the backend supports.""" - from mellea.backends.huggingface import LocalHFBackend - - if self.backend.model_id == IBM_GRANITE_3_2_8B and isinstance( - self.backend, LocalHFBackend - ): - from mellea.backends.aloras.huggingface.granite_aloras import ( - add_granite_aloras, - ) - - add_granite_aloras(self.backend) - return - self._session_logger.warning( - "This model/backend combination does not support any aloras." 
- ) - def genslot( self, gen_slot: Component, From 16e909d3fec052f9d901cc8fbfa6d8b62fcc39e2 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:03:54 -0700 Subject: [PATCH 19/23] changing env var name --- .github/workflows/quality.yml | 2 +- pyproject.toml | 2 +- test/conftest.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 1e3a97ba..d68b457f 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -18,7 +18,7 @@ jobs: matrix: python-version: ['3.10', '3.11', '3.12'] # Need to add 3.13 once we resolve outlines issues. env: - GITHUB_ACTION: 1 + CICD: 1 OLLAMA_HOST: "127.0.0.1:5000" steps: - uses: actions/checkout@v4 diff --git a/pyproject.toml b/pyproject.toml index 6614b40a..4208cdf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -164,5 +164,5 @@ python_version = "3.10" [tool.pytest.ini_options] markers = [ - "qualitative: Marks the test as needing an exact output from an LLM; set by an ENV variable for GITHUB_ACTIONS. All tests marked with this will xfail in CI/CD" + "qualitative: Marks the test as needing an exact output from an LLM; set by an ENV variable for CICD. All tests marked with this will xfail in CI/CD" ] \ No newline at end of file diff --git a/test/conftest.py b/test/conftest.py index 7b4fbff3..6e5d83c6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -10,7 +10,7 @@ @pytest.fixture(scope="session") def gh_run() -> int: - return int(os.environ.get("GITHUB_ACTION", 0)) # type: ignore + return int(os.environ.get("CICD", 0)) # type: ignore def pytest_runtest_setup(item): @@ -18,9 +18,9 @@ def pytest_runtest_setup(item): if not item.get_closest_marker("qualitative"): return - gh_run = int(os.environ.get("GITHUB_ACTION", 0)) + gh_run = int(os.environ.get("CICD", 0)) if gh_run == 1: pytest.skip( - reason="Skipping qualitative test: got env variable GITHUB_ACTION == 1. Used only in gh workflows." 
+ reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." ) From 5c0c3eea6bd40f40755cda57e9aea6b6b29a3272 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:20:33 -0700 Subject: [PATCH 20/23] minor changes --- test/backends/test_watsonx.py | 26 ++++++++++++++++---------- test/stdlib_basics/test_session.py | 2 ++ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/test/backends/test_watsonx.py b/test/backends/test_watsonx.py index 255f2164..85dedd66 100644 --- a/test/backends/test_watsonx.py +++ b/test/backends/test_watsonx.py @@ -15,36 +15,42 @@ @pytest.fixture(scope="module") def backend(): """Shared Watson backend for all tests in this module.""" - return WatsonxAIBackend( + if os.environ.get("CICD") == 1: + pytest.skip("Skipping watsonx tests.") + else: + return WatsonxAIBackend( model_id="ibm/granite-3-3-8b-instruct", formatter=TemplateFormatter(model_id="ibm-granite/granite-3.3-8b-instruct"), ) @pytest.fixture(scope="function") -def session(backend): - """Fresh Watson session for each test.""" - session = MelleaSession(backend, ctx=LinearContext(is_chat_context=True)) - yield session - session.reset() +def session(backend: WatsonxAIBackend): + if os.environ.get("CICD") == 1: + pytest.skip("Skipping watsonx tests.") + else: + """Fresh Watson session for each test.""" + session = MelleaSession(backend, ctx=LinearContext(is_chat_context=True)) + yield session + session.reset() @pytest.mark.qualitative -def test_instruct(session): +def test_instruct(session: MelleaSession): result = session.instruct("Compute 1+1.") assert isinstance(result, ModelOutputThunk) assert "2" in result.value # type: ignore @pytest.mark.qualitative -def test_multiturn(session): +def test_multiturn(session: MelleaSession): session.instruct("What is the capital of France?") answer = session.instruct("Tell me the answer to the previous question.") assert "Paris" in answer.value # type: ignore @pytest.mark.qualitative 
-def test_format(session): +def test_format(session: MelleaSession): class Person(pydantic.BaseModel): name: str # it does not support regex patterns in json schema @@ -78,7 +84,7 @@ class Email(pydantic.BaseModel): @pytest.mark.qualitative -def test_generate_from_raw(session): +def test_generate_from_raw(session: MelleaSession): prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"] results = session.backend._generate_from_raw( diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 6ce61c14..8b9ff7d1 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -1,3 +1,5 @@ +import os + import pytest from mellea.stdlib.base import ModelOutputThunk From 4fc84beecb2d81c869b010f6e26288ce59c9ee8d Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:40:00 -0700 Subject: [PATCH 21/23] ignoring more watsonx for now --- test/conftest.py | 2 +- test/stdlib_basics/test_session.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/conftest.py b/test/conftest.py index 6e5d83c6..e95ce41b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -21,6 +21,6 @@ def pytest_runtest_setup(item): gh_run = int(os.environ.get("CICD", 0)) if gh_run == 1: - pytest.skip( + pytest.xfail( reason="Skipping qualitative test: got env variable CICD == 1. Used only in gh workflows." 
) diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 8b9ff7d1..2d91df97 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -7,10 +7,13 @@ def test_start_session_watsonx(): - m = start_session(backend_name="watsonx") - response = m.instruct("testing") - assert isinstance(response, ModelOutputThunk) - assert response.value is not None + if os.environ.get("CICD") == 1: + pytest.skip("Skipping watsonx tests.") + else: + m = start_session(backend_name="watsonx") + response = m.instruct("testing") + assert isinstance(response, ModelOutputThunk) + assert response.value is not None def test_start_session_openai_with_kwargs(gh_run): From 6b86010600fa9dc85debab0f1ea13fa8e0bf3c36 Mon Sep 17 00:00:00 2001 From: Avinash Balakrishnan Date: Tue, 26 Aug 2025 08:50:24 -0700 Subject: [PATCH 22/23] minor changes --- test/stdlib_basics/test_session.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/stdlib_basics/test_session.py b/test/stdlib_basics/test_session.py index 2d91df97..9caa8d6f 100644 --- a/test/stdlib_basics/test_session.py +++ b/test/stdlib_basics/test_session.py @@ -6,8 +6,8 @@ from mellea.stdlib.session import start_session -def test_start_session_watsonx(): - if os.environ.get("CICD") == 1: +def test_start_session_watsonx(gh_run): + if gh_run == 1: pytest.skip("Skipping watsonx tests.") else: m = start_session(backend_name="watsonx") From e056768decd40b77cf977fcae76b71171f1f8562 Mon Sep 17 00:00:00 2001 From: Jake LoRocco Date: Tue, 26 Aug 2025 12:38:57 -0400 Subject: [PATCH 23/23] fix non-duplicate member func for mify in python 3.11 --- mellea/stdlib/mify.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mellea/stdlib/mify.py b/mellea/stdlib/mify.py index 639bbc80..5b278f2d 100644 --- a/mellea/stdlib/mify.py +++ b/mellea/stdlib/mify.py @@ -132,8 +132,8 @@ def _get_all_fields(self) -> dict[str, Any]: if self._fields_exclude: 
fields_exclude = self._fields_exclude - # This includes fields defined by any superclasses, as long as it's not object. - all_fields = _get_non_duplicate_fields(self, object) + # This includes fields defined by any superclasses, as long as it's not Protocol. + all_fields = _get_non_duplicate_fields(self, Protocol) # It does matter if include is an empty set. Handle it's cases here. if self._fields_include is not None: @@ -366,18 +366,15 @@ def mification(obj: T) -> T: def _get_non_duplicate_members( - object: object, check_duplicates: object + obj: object, check_duplicates: object ) -> dict[str, Callable]: """Returns all methods/functions unique to the object.""" members = dict( inspect.getmembers( - object, + obj, # Checks for ismethod or isfunction because of the methods added from the MifiedProtocol. - predicate=lambda x: inspect.ismethod(x) - or ( - inspect.isfunction(x) - and x.__name__ not in dict(inspect.getmembers(check_duplicates)).keys() - ), + predicate=lambda x: (inspect.ismethod(x) or inspect.isfunction(x)) + and x.__name__ not in dict(inspect.getmembers(check_duplicates)).keys(), ) ) return members