
Commit 68cf8c6

adding modified tests with optional LLM runs
1 parent 9498f0f commit 68cf8c6

File tree

10 files changed (+142 -94 lines)

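The diffs below tag every backend test that needs a live model with a new @pytest.mark.llm marker, which is what makes the LLM runs optional. The marker registration itself is not visible in this commit; the following is a minimal sketch of the registration and local deselection this implies (hook placement and wording are assumptions, not taken from the commit):

# Sketch only -- how the "llm" marker is typically registered (not shown in this commit).
def pytest_configure(config):
    # Declare the custom marker so pytest does not warn about unknown marks.
    config.addinivalue_line(
        "markers", "llm: test requires a live LLM backend (Ollama, HuggingFace, Watsonx, OpenAI)"
    )


# Local runs can then deselect LLM-dependent tests with: pytest -m "not llm"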

test/backends/test_huggingface.py

Lines changed: 12 additions & 11 deletions
@@ -21,6 +21,7 @@
 @pytest.fixture(scope="module")
 def backend():
     """Shared HuggingFace backend for all tests in this module."""
+    # TODO: find a smaller 1B model to do Alora stuff on github actions.
     backend = LocalHFBackend(
         model_id="ibm-granite/granite-3.2-8b-instruct",
         formatter=TemplateFormatter(model_id="ibm-granite/granite-4.0-tiny-preview"),
@@ -37,15 +38,15 @@ def session(backend):
     yield session
     session.reset()

-
+@pytest.mark.llm
 def test_system_prompt(session):
     result = session.chat(
         "Where are we going?",
         model_options={ModelOption.SYSTEM_PROMPT: "Talk like a pirate."},
     )
     print(result)

-
+@pytest.mark.llm
 def test_constraint_alora(session, backend):
     answer = session.instruct(
         "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa. Be concise and don't write code to answer the question.",
@@ -63,7 +64,7 @@ def test_constraint_alora(session, backend):
     )
     assert alora_output in ["Y", "N"], alora_output

-
+@pytest.mark.llm
 def test_constraint_lora_with_requirement(session, backend):
     answer = session.instruct(
         "Corporate wants you to find the difference between these two strings: aaaaaaaaaa aaaaabaaaa"
@@ -79,7 +80,7 @@ def test_constraint_lora_with_requirement(session, backend):
     assert isinstance(val_result, ValidationResult)
     assert str(val_result.reason) in ["Y", "N"]

-
+@pytest.mark.llm
 def test_constraint_lora_override(session, backend):
     backend.default_to_constraint_checking_alora = False  # type: ignore
     answer = session.instruct(
@@ -94,7 +95,7 @@ def test_constraint_lora_override(session, backend):
     assert isinstance(default_output_to_bool(str(val_result.reason)), bool)
     backend.default_to_constraint_checking_alora = True

-
+@pytest.mark.llm
 def test_constraint_lora_override_does_not_override_alora(session, backend):
     backend.default_to_constraint_checking_alora = False  # type: ignore
     answer = session.instruct(
@@ -111,7 +112,7 @@ def test_constraint_lora_override_does_not_override_alora(session, backend):
     assert str(val_result.reason) in ["Y", "N"]
     backend.default_to_constraint_checking_alora = True

-
+@pytest.mark.llm
 def test_llmaj_req_does_not_use_alora(session, backend):
     backend.default_to_constraint_checking_alora = True  # type: ignore
     answer = session.instruct(
@@ -127,12 +128,12 @@ def test_llmaj_req_does_not_use_alora(session, backend):
     assert isinstance(val_result, ValidationResult)
     assert str(val_result.reason) not in ["Y", "N"]

-
+@pytest.mark.llm
 def test_instruct(session):
     result = session.instruct("Compute 1+1.")
     print(result)

-
+@pytest.mark.llm
 def test_multiturn(session):
     session.instruct("Compute 1+1")
     beta = session.instruct(
@@ -142,7 +143,7 @@ def test_multiturn(session):
     words = session.instruct("Now list five English words that start with that letter.")
     print(words)

-
+@pytest.mark.llm
 def test_format(session):
     class Person(pydantic.BaseModel):
         name: str
@@ -172,7 +173,7 @@ class Email(pydantic.BaseModel):
         "The email address should be at example.com"
     )

-
+@pytest.mark.llm
 def test_generate_from_raw(session):
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]

@@ -182,7 +183,7 @@ def test_generate_from_raw(session):

     assert len(results) == len(prompts)

-
+@pytest.mark.llm
 def test_generate_from_raw_with_format(session):
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]


test/backends/test_ollama.py

Lines changed: 20 additions & 11 deletions
@@ -1,11 +1,13 @@
-from mellea import start_session, SimpleContext
-from mellea.stdlib.base import CBlock
-from mellea.stdlib.requirement import Requirement
-import pydantic
 import json
+
+import pydantic
+import pytest
 from typing_extensions import Annotated
+
+from mellea import SimpleContext, start_session
 from mellea.backends.types import ModelOption
-import pytest
+from mellea.stdlib.base import CBlock
+from mellea.stdlib.requirement import Requirement


 @pytest.fixture(scope="function")
@@ -15,6 +17,8 @@ def session():
     yield session
     session.reset()

+
+@pytest.mark.llm
 def test_simple_instruct(session):
     result = session.instruct(
         "Write an email to Hendrik trying to sell him self-sealing stembolts."
@@ -23,6 +27,8 @@ def test_simple_instruct(session):
     assert "chat_response" in result._meta
     assert result._meta["chat_response"].message.role == "assistant"

+
+@pytest.mark.llm
 def test_instruct_with_requirement(session):
     response = session.instruct(
         "Write an email to Hendrik convincing him to buy some self-sealing stembolts."
@@ -45,12 +51,14 @@ def test_instruct_with_requirement(session):
     )
     print(results)

+@pytest.mark.llm
 def test_chat(session):
     output_message = session.chat("What is 1+1?")
-    assert (
-        "2" in output_message.content
-    ), f"Expected a message with content containing 2 but found {output_message}"
+    assert "2" in output_message.content, (
+        f"Expected a message with content containing 2 but found {output_message}"
+    )

+@pytest.mark.llm
 def test_format(session):
     class Person(pydantic.BaseModel):
         name: str
@@ -83,6 +91,7 @@ class Email(pydantic.BaseModel):
     # assert email.to.email_address.endswith("example.com")
     pass

+@pytest.mark.llm
 def test_generate_from_raw(session):
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]

@@ -113,9 +122,9 @@ class Answer(pydantic.BaseModel):
     try:
         answer = Answer.model_validate_json(random_result.value)
     except pydantic.ValidationError as e:
-        assert (
-            False
-        ), f"formatting directive failed for {random_result.value}: {e.json()}"
+        assert False, (
+            f"formatting directive failed for {random_result.value}: {e.json()}"
+        )


 if __name__ == "__main__":

test/backends/test_openai_ollama.py

Lines changed: 30 additions & 18 deletions
@@ -1,41 +1,52 @@
 # test/rits_backend_tests/test_openai_integration.py
+import pydantic
+import pytest
+from typing_extensions import Annotated
+
 from mellea import MelleaSession
-from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk
-from mellea.backends.openai import OpenAIBackend
 from mellea.backends.formatter import TemplateFormatter
+from mellea.backends.model_ids import META_LLAMA_3_2_1B
+from mellea.backends.openai import OpenAIBackend
 from mellea.backends.types import ModelOption
-
-import pydantic
-from typing_extensions import Annotated
-import pytest
+from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk


 @pytest.fixture(scope="module")
-def backend():
+def backend(gh_run: int):
     """Shared OpenAI backend configured for Ollama."""
-    return OpenAIBackend(
-        model_id="granite3.3:8b",
-        formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"),
+    if gh_run == 1:
+        return OpenAIBackend(
+            model_id=META_LLAMA_3_2_1B,
+            formatter=TemplateFormatter(model_id=META_LLAMA_3_2_1B),
            base_url="http://localhost:11434/v1",
            api_key="ollama",
        )
+    else:
+        return OpenAIBackend(
+            model_id="granite3.3:8b",
+            formatter=TemplateFormatter(model_id="ibm-granite/granite-3.2-8b-instruct"),
+            base_url="http://localhost:11434/v1",
+            api_key="ollama",
+        )


 @pytest.fixture(scope="function")
-def session(backend):
+def m_session(backend):
     """Fresh OpenAI session for each test."""
     session = MelleaSession(backend, ctx=LinearContext(is_chat_context=True))
     yield session
     session.reset()

-def test_instruct(session):
-    result = session.instruct("Compute 1+1.")
+@pytest.mark.llm
+def test_instruct(m_session):
+    result = m_session.instruct("Compute 1+1.")
     assert isinstance(result, ModelOutputThunk)
     assert "2" in result.value  # type: ignore

-def test_multiturn(session):
-    session.instruct("What is the capital of France?")
-    answer = session.instruct("Tell me the answer to the previous question.")
+@pytest.mark.llm
+def test_multiturn(m_session):
+    m_session.instruct("What is the capital of France?")
+    answer = m_session.instruct("Tell me the answer to the previous question.")
     assert "Paris" in answer.value  # type: ignore

 # def test_api_timeout_error(self):
@@ -53,7 +64,8 @@ def test_multiturn(session):
 #     assert "granite3.3:8b" in result.value
 #     self.m.reset()

-def test_format(session):
+@pytest.mark.llm
+def test_format(m_session):
     class Person(pydantic.BaseModel):
         name: str
         # it does not support regex patterns in json schema
@@ -68,7 +80,7 @@ class Email(pydantic.BaseModel):
         subject: str
         body: str

-    output = session.instruct(
+    output = m_session.instruct(
         "Write a short email to Olivia, thanking her for organizing a sailing activity. Her email server is example.com. No more than two sentences. ",
         format=Email,
         model_options={ModelOption.MAX_NEW_TOKENS: 2**8},

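The reworked backend fixture above now takes a gh_run argument, but the fixture providing that value is not visible in this commit. Below is a minimal sketch of what such a fixture could look like, assuming it mirrors the GITHUB_ACTION check used in test/conftest.py; the name, scope, and placement are assumptions, not taken from the diff.

# Hypothetical gh_run fixture (not shown in this commit).
import os

import pytest


@pytest.fixture(scope="session")
def gh_run() -> int:
    # Returns 1 when GITHUB_ACTION is set to 1 (GitHub workflow runs), else 0,
    # mirroring the check in test/conftest.py.
    return int(os.environ.get("GITHUB_ACTION", 0))
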
test/backends/test_watsonx.py

Lines changed: 14 additions & 8 deletions
@@ -1,14 +1,15 @@
 # test/rits_backend_tests/test_watsonx_integration.py
 import os
-from mellea import MelleaSession
-from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk
-from mellea.backends.watsonx import WatsonxAIBackend
-from mellea.backends.formatter import TemplateFormatter
-from mellea.backends.types import ModelOption

 import pydantic
-from typing_extensions import Annotated
 import pytest
+from typing_extensions import Annotated
+
+from mellea import MelleaSession
+from mellea.backends.formatter import TemplateFormatter
+from mellea.backends.types import ModelOption
+from mellea.backends.watsonx import WatsonxAIBackend
+from mellea.stdlib.base import CBlock, LinearContext, ModelOutputThunk


 @pytest.fixture(scope="module")
@@ -28,18 +29,21 @@ def session(backend):
     session.reset()


-
-
+@pytest.mark.llm
 def test_instruct(session):
     result = session.instruct("Compute 1+1.")
     assert isinstance(result, ModelOutputThunk)
     assert "2" in result.value  # type: ignore

+
+@pytest.mark.llm
 def test_multiturn(session):
     session.instruct("What is the capital of France?")
     answer = session.instruct("Tell me the answer to the previous question.")
     assert "Paris" in answer.value  # type: ignore

+
+@pytest.mark.llm
 def test_format(session):
     class Person(pydantic.BaseModel):
         name: str
@@ -72,6 +76,8 @@ class Email(pydantic.BaseModel):
     # assert email.to.email_address.endswith("example.com")
     pass

+
+@pytest.mark.llm
 def test_generate_from_raw(session):
     prompts = ["what is 1+1?", "what is 2+2?", "what is 3+3?", "what is 4+4?"]


test/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def pytest_runtest_setup(item):
     gh_run = int(os.environ.get("GITHUB_ACTION", 0))

     if gh_run == 1:
-        pytest.skip(
+        pytest.xfail(
            reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows."
        )

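The conftest hunk above shows only the tail of pytest_runtest_setup: the CI guard now calls pytest.xfail instead of pytest.skip, so these tests surface as expected failures instead of silently disappearing from the report. A minimal sketch of how the full hook plausibly fits together follows; the llm-marker guard is an assumption, since it sits outside the displayed context.

# Sketch of the surrounding hook (only the skip -> xfail change is shown above).
import os

import pytest


def pytest_runtest_setup(item):
    # Assumed guard: only tests carrying the new "llm" marker are affected.
    if "llm" not in item.keywords:
        return

    gh_run = int(os.environ.get("GITHUB_ACTION", 0))

    if gh_run == 1:
        pytest.xfail(
            reason="Skipping LLM test: got env variable GITHUB_ACTION == 1. Used only in gh workflows."
        )
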
test/stdlib_basics/test_base.py

Lines changed: 1 addition & 2 deletions
@@ -1,5 +1,4 @@
-from mellea.stdlib.base import Component, CBlock
-from mellea.stdlib.base import LinearContext
+from mellea.stdlib.base import CBlock, Component, LinearContext


 def test_cblock():

test/stdlib_basics/test_chat_view.py

Lines changed: 3 additions & 2 deletions
@@ -1,7 +1,8 @@

 import pytest
-from mellea.stdlib.base import ModelOutputThunk, LinearContext
-from mellea.stdlib.chat import as_chat_history, Message
+
+from mellea.stdlib.base import LinearContext, ModelOutputThunk
+from mellea.stdlib.chat import Message, as_chat_history
 from mellea.stdlib.session import start_session
