diff --git a/docs/examples/intrinsics/answer_relevance.py b/docs/examples/intrinsics/answer_relevance.py index 1be2b368..d0e7bbd1 100644 --- a/docs/examples/intrinsics/answer_relevance.py +++ b/docs/examples/intrinsics/answer_relevance.py @@ -13,7 +13,7 @@ from mellea.stdlib.intrinsics import rag -backend = LocalHFBackend(model_id="ibm-granite/granite-3.3-2b-instruct") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") context = ChatContext().add(Message("user", "Who attended the meeting?")) documents = [ Document("Meeting attendees: Alice, Bob, Carol."), diff --git a/docs/examples/intrinsics/answerability.py b/docs/examples/intrinsics/answerability.py index a3d20bc4..ce9ff069 100644 --- a/docs/examples/intrinsics/answerability.py +++ b/docs/examples/intrinsics/answerability.py @@ -13,7 +13,7 @@ from mellea.stdlib.intrinsics import rag -backend = LocalHFBackend(model_id="ibm-granite/granite-3.3-2b-instruct") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") context = ChatContext().add(Message("assistant", "Hello there, how can I help you?")) next_user_turn = "What is the square root of 4?" documents_answerable = [Document("The square root of 4 is 2.")] diff --git a/docs/examples/intrinsics/citations.py b/docs/examples/intrinsics/citations.py index aa27ed00..09fe7724 100644 --- a/docs/examples/intrinsics/citations.py +++ b/docs/examples/intrinsics/citations.py @@ -14,7 +14,7 @@ import json -backend = LocalHFBackend(model_id="ibm-granite/granite-3.3-2b-instruct") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") context = ChatContext().add( Message( "user", diff --git a/docs/examples/intrinsics/context_relevance.py b/docs/examples/intrinsics/context_relevance.py index dcf24062..ff6c985d 100644 --- a/docs/examples/intrinsics/context_relevance.py +++ b/docs/examples/intrinsics/context_relevance.py @@ -13,7 +13,7 @@ from mellea.stdlib.intrinsics import rag -backend = LocalHFBackend(model_id="ibm-granite/granite-3.3-2b-instruct") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") context = ChatContext() question = "Who is the CEO of Microsoft?" document = Document( @@ -28,4 +28,4 @@ ) result = rag.check_context_relevance(question, document, context, backend) -print(f"Result of context relevance check: {result}") +print(f"Result of context relevance check with irrelevant document: {result}") diff --git a/docs/examples/intrinsics/hallucination_detection.py b/docs/examples/intrinsics/hallucination_detection.py index 74ab966e..ed1838d7 100644 --- a/docs/examples/intrinsics/hallucination_detection.py +++ b/docs/examples/intrinsics/hallucination_detection.py @@ -14,7 +14,7 @@ import json -backend = LocalHFBackend(model_id="ibm-granite/granite-3.3-2b-instruct") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") context = ( ChatContext() .add(Message("assistant", "Hello there, how can I help you?")) diff --git a/docs/examples/intrinsics/query_rewrite.py b/docs/examples/intrinsics/query_rewrite.py index e0ef32a1..39aabac1 100644 --- a/docs/examples/intrinsics/query_rewrite.py +++ b/docs/examples/intrinsics/query_rewrite.py @@ -13,7 +13,7 @@ from mellea.stdlib.intrinsics import rag -backend = LocalHFBackend(model_id="ibm-granite/granite-3.3-2b-instruct") +backend = LocalHFBackend(model_id="ibm-granite/granite-4.0-micro") context = ( ChatContext() .add(Message("assistant", "Welcome to pet questions!")) diff --git a/mellea/backends/adapters/catalog.py b/mellea/backends/adapters/catalog.py index 0ef73219..4530516b 100644 --- a/mellea/backends/adapters/catalog.py +++ b/mellea/backends/adapters/catalog.py @@ -38,15 +38,16 @@ class IntriniscsCatalogEntry(pydantic.BaseModel): ) -_RAG_REPO = "ibm-granite/rag-intrinsics-lib" +_RAG_REPO = "ibm-granite/granite-lib-rag-r1.0" +_CORE_REPO = "ibm-granite/rag-intrinsics-lib" _INTRINSICS_CATALOG_ENTRIES = [ ############################################ # Core Intrinsics ############################################ - IntriniscsCatalogEntry(name="requirement_check", repo_id=_RAG_REPO), - IntriniscsCatalogEntry(name="uncertainty", repo_id=_RAG_REPO), + IntriniscsCatalogEntry(name="requirement_check", repo_id=_CORE_REPO), + IntriniscsCatalogEntry(name="uncertainty", repo_id=_CORE_REPO), ############################################ # RAG Intrinsics ############################################ diff --git a/test/stdlib_intrinsics/test_rag/test_rag.py b/test/stdlib_intrinsics/test_rag/test_rag.py index 47b13e02..ea812d1c 100644 --- a/test/stdlib_intrinsics/test_rag/test_rag.py +++ b/test/stdlib_intrinsics/test_rag/test_rag.py @@ -17,7 +17,7 @@ """Location of data files for the tests in this file.""" -BASE_MODEL = "ibm-granite/granite-3.3-2b-instruct" +BASE_MODEL = "ibm-granite/granite-4.0-micro" @pytest.fixture(name="backend") @@ -82,11 +82,11 @@ def test_answerability(backend): # First call triggers adapter loading result = rag.check_answerability(next_user_turn, documents, context, backend) - assert pytest.approx(result) == 1.0 + assert pytest.approx(result, rel=0.01) == 1.0 # Second call hits a different code path from the first one result = rag.check_answerability(next_user_turn, documents, context, backend) - assert pytest.approx(result) == 1.0 + assert pytest.approx(result, rel=0.01) == 1.0 @pytest.mark.qualitative @@ -94,8 +94,7 @@ def test_query_rewrite(backend): """Verify that the answerability intrinsic functions properly.""" context, next_user_turn, _ = _read_input_json("query_rewrite.json") expected = ( - "Is Rex, the dog, more likely to get fleas because he spends a lot of " - "time outdoors?" + "Is Rex more likely to get fleas because he spends a lot of time outdoors?" ) # First call triggers adapter loading @@ -132,11 +131,11 @@ def test_context_relevance(backend): # First call triggers adapter loading result = rag.check_context_relevance(question, document, context, backend) - assert pytest.approx(result, abs=2e-2) == 0.45 + assert pytest.approx(result, abs=1e-2) == 0.0 # Second call hits a different code path from the first one result = rag.check_context_relevance(question, document, context, backend) - assert pytest.approx(result, abs=2e-2) == 0.45 + assert pytest.approx(result, abs=1e-2) == 0.0 @pytest.mark.qualitative @@ -165,9 +164,7 @@ def test_answer_relevance(backend): # Note that this is not the optimal answer. This test is currently using an # outdated LoRA adapter. Releases of new adapters will come after the Mellea # integration has stabilized. - expected_rewrite = ( - "The documents do not provide information about the attendees of the meeting." - ) + expected_rewrite = "The meeting attendees were Alice, Bob, and Carol." # First call triggers adapter loading result = rag.rewrite_answer_for_relevance(answer, docs, context, backend) diff --git a/test/stdlib_intrinsics/test_rag/testdata/output_json/hallucination_detection.json b/test/stdlib_intrinsics/test_rag/testdata/output_json/hallucination_detection.json index 7546817e..06e80be5 100644 --- a/test/stdlib_intrinsics/test_rag/testdata/output_json/hallucination_detection.json +++ b/test/stdlib_intrinsics/test_rag/testdata/output_json/hallucination_detection.json @@ -3,7 +3,7 @@ { "index": 0, "message": { - "content": "[{\"response_begin\": 0, \"response_end\": 36, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.2062460112028628, \"explanation\": \"This sentence makes a factual claim about the color of fish. However, the provided document only mentions one type of fish that is yellow, which is the purple bumble fish. There is no information about green bumble fish in the document, so the claim about green bumble fish being yellow cannot be verified.\"}, {\"response_begin\": 36, \"response_end\": 70, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.006380047389365753, \"explanation\": \"This sentence makes a factual claim about the color of fish. However, the provided document only mentions one type of fish that is yellow, which is the purple bumble fish. There is no information about green bumble fish in the document, so the claim about green bumble fish being yellow cannot be verified.\"}]", + "content": "[{\"response_begin\": 0, \"response_end\": 36, \"response_text\": \"Purple bumble fish are yellow. \", \"faithfulness_likelihood\": 0.7280598165124975, \"explanation\": \"This sentence makes a factual claim about the color of purple bumble fish. The document states 'The only type of fish that is yellow is the purple bumble fish.' This directly supports the claim in the sentence.\"}, {\"response_begin\": 36, \"response_end\": 70, \"response_text\": \"Green bumble fish are also yellow.\", \"faithfulness_likelihood\": 0.08656033472953338, \"explanation\": \"This sentence makes a factual claim about the color of green bumble fish. However, the document does not mention green bumble fish at all. Therefore, this claim cannot be verified from the provided context.\"}]", "role": "assistant" } }