Add tutorial for evaluating LangGraph agents (#1636)

sahusiddharth · jjmachan · web-flow · commit 1d170d72c747 · 2024-11-08T21:57:36.000+05:30
- Fixes #1635. This PR adds a detailed tutorial to guide users through building a ReAct agent using LangGraph. The tutorial also walks users through setting up an evaluation pipeline using Ragas to assess the agent's performance. --------- Co-authored-by: Jithin James <jamesjithin97@gmail.com>
diff --git a/docs/_static/imgs/_langgraph_agent_evaluation_28_0.jpg b/docs/_static/imgs/_langgraph_agent_evaluation_28_0.jpg
diff --git a/docs/howtos/integrations/_langgraph_agent_evaluation.md b/docs/howtos/integrations/_langgraph_agent_evaluation.md
diff --git a/docs/howtos/integrations/langgraph_agent_evaluation.ipynb b/docs/howtos/integrations/langgraph_agent_evaluation.ipynb
diff --git a/docs/references/integrations.md b/docs/references/integrations.md
@@ -16,3 +16,7 @@
 ::: ragas.integrations.helicone
     options:
         show_root_heading: true
+
+::: ragas.integrations.langgraph
+    options:
+        show_root_heading: true
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -90,6 +90,7 @@ nav:
       - Integrations:
           - howtos/integrations/index.md
           - LlamaIndex: howtos/integrations/_llamaindex.md
+          - LangGraph: howtos/integrations/_langgraph_agent_evaluation.md
       - Migrations:
           - From v0.1 to v0.2: howtos/migrations/migrate_from_v01_to_v02.md
   - 📖 References: 
diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
@@ -7,9 +7,8 @@
 from langchain_core.callbacks import BaseCallbackHandler, BaseCallbackManager
 from langchain_core.embeddings import Embeddings as LangchainEmbeddings
 from langchain_core.language_models import BaseLanguageModel as LangchainLLM
-
-from llama_index.core.base.llms.base import BaseLLM as LlamaIndexLLM
 from llama_index.core.base.embeddings.base import BaseEmbedding as LlamaIndexEmbedding
+from llama_index.core.base.llms.base import BaseLLM as LlamaIndexLLM
 
 from ragas._analytics import EvaluationEvent, track, track_was_completed
 from ragas.callbacks import ChainType, RagasTracer, new_group
@@ -61,7 +60,9 @@ def evaluate(
     dataset: t.Union[Dataset, EvaluationDataset],
     metrics: t.Optional[t.Sequence[Metric]] = None,
     llm: t.Optional[BaseRagasLLM | LangchainLLM | LlamaIndexLLM] = None,
-    embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings | LlamaIndexEmbedding] = None,
+    embeddings: t.Optional[
+        BaseRagasEmbeddings | LangchainEmbeddings | LlamaIndexEmbedding
+    ] = None,
     callbacks: Callbacks = None,
     in_ci: bool = False,
     run_config: RunConfig = RunConfig(),
diff --git a/src/ragas/integrations/langgraph.py b/src/ragas/integrations/langgraph.py
@@ -0,0 +1,98 @@
+import json
+from typing import List, Union
+
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
+
+import ragas.messages as r
+
+
+def convert_to_ragas_messages(
+    messages: List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]]
+) -> List[Union[r.HumanMessage, r.AIMessage, r.ToolMessage]]:
+    """
+    Convert LangChain messages into Ragas messages for agent evaluation.
+
+    Parameters
+    ----------
+    messages : List[Union[HumanMessage, SystemMessage, AIMessage, ToolMessage]]
+        List of LangChain message objects to be converted.
+
+    Returns
+    -------
+    List[Union[r.HumanMessage, r.AIMessage, r.ToolMessage]]
+        List of corresponding Ragas message objects.
+
+    Raises
+    ------
+    ValueError
+        If an unsupported message type is encountered.
+    TypeError
+        If message content is not a string.
+    json.JSONDecodeError
+        If a tool call's ``arguments`` string is not valid JSON.
+
+    Notes
+    -----
+    SystemMessages are skipped in the conversion process.
+    Subclasses of the supported message types are converted like their base type.
+    Tool calls are read from ``additional_kwargs["tool_calls"]``; an AIMessage
+    without any tool calls is converted with ``tool_calls=None``.
+    """
+
+    def _validate_string_content(message, message_type: str) -> str:
+        # Only plain-string content is supported; fail loudly on anything else.
+        if not isinstance(message.content, str):
+            raise TypeError(
+                f"{message_type} content must be a string, got {type(message.content).__name__}. "
+                f"Content: {message.content}"
+            )
+        return message.content
+
+    # Converters for message types that map one-to-one onto a Ragas message.
+    MESSAGE_TYPE_MAP = {
+        HumanMessage: lambda m: r.HumanMessage(
+            content=_validate_string_content(m, "HumanMessage")
+        ),
+        ToolMessage: lambda m: r.ToolMessage(
+            content=_validate_string_content(m, "ToolMessage")
+        ),
+    }
+
+    def _extract_tool_calls(message: AIMessage) -> List[r.ToolCall]:
+        # Each entry is read as {"function": {"name": ..., "arguments": <JSON str>}}.
+        # A missing, empty or None "tool_calls" value all mean "no tool calls".
+        tool_calls = (message.additional_kwargs or {}).get("tool_calls") or []
+        return [
+            r.ToolCall(
+                name=tool_call["function"]["name"],
+                args=json.loads(tool_call["function"]["arguments"]),
+            )
+            for tool_call in tool_calls
+        ]
+
+    def _convert_ai_message(message: AIMessage) -> r.AIMessage:
+        # Normalize "no tool calls" to None, whether or not additional_kwargs
+        # happens to carry other, unrelated keys.
+        tool_calls = _extract_tool_calls(message) or None
+        return r.AIMessage(
+            content=_validate_string_content(message, "AIMessage"),
+            tool_calls=tool_calls,
+        )
+
+    def _convert_message(message):
+        if isinstance(message, SystemMessage):
+            return None  # Skip SystemMessages
+        if isinstance(message, AIMessage):
+            return _convert_ai_message(message)
+        # isinstance (not an exact type lookup) so subclasses are accepted,
+        # consistent with the SystemMessage/AIMessage checks above.
+        for message_type, converter in MESSAGE_TYPE_MAP.items():
+            if isinstance(message, message_type):
+                return converter(message)
+        raise ValueError(f"Unsupported message type: {type(message).__name__}")
+
+    return [
+        converted
+        for message in messages
+        if (converted := _convert_message(message)) is not None
+    ]
diff --git a/src/ragas/metrics/_topic_adherence.py b/src/ragas/metrics/_topic_adherence.py
@@ -48,9 +48,7 @@ class TopicClassificationOutput(BaseModel):
 class TopicClassificationPrompt(
     PydanticPrompt[TopicClassificationInput, TopicClassificationOutput]
 ):
-    instruction = (
-        "Given a set of topics classify if the topic falls into any of the given reference topics."
-    )
+    instruction = "Given a set of topics classify if the topic falls into any of the given reference topics."
     input_model = TopicClassificationInput
     output_model = TopicClassificationOutput
     examples = [
@@ -149,10 +147,14 @@ class TopicAdherenceScore(MetricWithLLM, MultiTurnMetric):
     topic_classification_prompt: PydanticPrompt = TopicClassificationPrompt()
     topic_refused_prompt: PydanticPrompt = TopicRefusedPrompt()
 
-    async def _multi_turn_ascore(self, sample: MultiTurnSample, callbacks: Callbacks) -> float:
+    async def _multi_turn_ascore(
+        self, sample: MultiTurnSample, callbacks: Callbacks
+    ) -> float:
         assert self.llm is not None, "LLM must be set"
         assert isinstance(sample.user_input, list), "Sample user_input must be a list"
-        assert isinstance(sample.reference_topics, list), "Sample reference_topics must be a list"
+        assert isinstance(
+            sample.reference_topics, list
+        ), "Sample reference_topics must be a list"
         user_input = sample.pretty_repr()
 
         prompt_input = TopicExtractionInput(user_input=user_input)
@@ -168,7 +170,9 @@ async def _multi_turn_ascore(self, sample: MultiTurnSample, callbacks: Callbacks
                 data=prompt_input, llm=self.llm, callbacks=callbacks
             )
             topic_answered_verdict.append(response.refused_to_answer)
-        topic_answered_verdict = np.array([not answer for answer in topic_answered_verdict])
+        topic_answered_verdict = np.array(
+            [not answer for answer in topic_answered_verdict]
+        )
 
         prompt_input = TopicClassificationInput(
             reference_topics=sample.reference_topics, topics=topics
diff --git a/tests/unit/test_langgraph.py b/tests/unit/test_langgraph.py
@@ -0,0 +1,129 @@
+import json
+
+import pytest
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
+
+import ragas.messages as r
+from ragas.integrations.langgraph import convert_to_ragas_messages
+
+
+def test_human_message_conversion():
+    """A string-content HumanMessage becomes an r.HumanMessage with the same text."""
+    question = "Hello, add 4 and 5"
+    converted = convert_to_ragas_messages(
+        [HumanMessage(content=question), ToolMessage(content="9", tool_call_id="1")]
+    )
+
+    assert len(converted) == 2
+    human = converted[0]
+    assert isinstance(human, r.HumanMessage)
+    assert human.content == question
+
+
+def test_human_message_invalid_content():
+    """List content on a HumanMessage is rejected with a TypeError."""
+    list_content_message = HumanMessage(content=["invalid", "content"])
+
+    with pytest.raises(TypeError) as raised:
+        convert_to_ragas_messages([list_content_message])
+    assert "HumanMessage content must be a string" in str(raised.value)
+
+
+def test_ai_message_conversion():
+    """A plain AIMessage converts to an r.AIMessage with no tool calls."""
+    reply = "I'm doing well, thanks!"
+    converted = convert_to_ragas_messages([AIMessage(content=reply)])
+
+    assert len(converted) == 1
+    assert isinstance(converted[0], r.AIMessage)
+    assert converted[0].content == reply
+    assert converted[0].tool_calls is None
+
+
+def test_ai_message_with_tool_calls():
+    """Tool calls in additional_kwargs are parsed into ToolCall objects, in order."""
+
+    def price_call(arguments):
+        # Payload shape the converter reads: name plus JSON-encoded arguments.
+        return {
+            "function": {"arguments": arguments, "name": "get_metal_price"}
+        }
+
+    question = "Find the difference in the price of gold and silver?"
+    ai_message = AIMessage(
+        content=question,
+        additional_kwargs={
+            "tool_calls": [
+                price_call('{"metal_name": "gold"}'),
+                price_call('{"metal_name": "silver"}'),
+            ]
+        },
+    )
+
+    result = convert_to_ragas_messages([ai_message])
+
+    assert len(result) == 1
+    converted = result[0]
+    assert isinstance(converted, r.AIMessage)
+    assert converted.content == question
+
+    calls = converted.tool_calls
+    assert len(calls) == 2
+    gold_call, silver_call = calls
+    assert gold_call.name == "get_metal_price"
+    assert gold_call.args == {"metal_name": "gold"}
+    assert silver_call.name == "get_metal_price"
+    assert silver_call.args == {"metal_name": "silver"}
+
+
+def test_tool_message_conversion():
+    """A string-content ToolMessage becomes an r.ToolMessage with the same text."""
+    human = HumanMessage(content="Hello, add 4 and 5")
+    tool = ToolMessage(content="9", tool_call_id="2")
+    converted = convert_to_ragas_messages([human, tool])
+
+    assert len(converted) == 2
+    # The tool output is the second message, after the human turn.
+    tool_result = converted[1]
+    assert isinstance(tool_result, r.ToolMessage)
+    assert tool_result.content == "9"
+
+
+def test_system_message_skipped():
+    """SystemMessages are dropped; only the HumanMessage survives conversion."""
+    system = SystemMessage(content="System prompt")
+    converted = convert_to_ragas_messages([system, HumanMessage(content="Hello")])
+
+    assert len(converted) == 1
+    assert isinstance(converted[0], r.HumanMessage)
+    assert converted[0].content == "Hello"
+
+
+def test_unsupported_message_type():
+    """An object that is not a supported LangChain message raises ValueError."""
+
+    class CustomMessage:
+        # Has content but is not one of the message types the converter knows.
+        content = "test"
+
+    with pytest.raises(ValueError) as raised:
+        convert_to_ragas_messages([CustomMessage()])
+
+    assert "Unsupported message type: CustomMessage" in str(raised.value)
+
+
+def test_empty_message_list():
+    """Converting an empty list yields an empty list."""
+    converted = convert_to_ragas_messages([])
+
+    assert converted == []
+
+
+def test_invalid_tool_calls_json():
+    """Non-JSON tool-call arguments surface as json.JSONDecodeError."""
+    # "arguments" is parsed as JSON by the converter; this value is not parseable.
+    bad_call = {"function": {"name": "search", "arguments": "invalid json"}}
+    message = AIMessage(content="Test", additional_kwargs={"tool_calls": [bad_call]})
+
+    with pytest.raises(json.JSONDecodeError):
+        convert_to_ragas_messages([message])