Skip to content

Commit 6571fc4

Browse files
authored
Merge pull request #153 from ks6088ts-labs/feature/issue-152_mlflow-evaluation
add MLflow operator CLI to demonstrate tracing and evaluation
2 parents 01060d0 + aea8044 commit 6571fc4

File tree

5 files changed

+303
-105
lines changed

5 files changed

+303
-105
lines changed

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,10 @@ create-elasticsearch-index: ## create Elasticsearch index
173173
# ---
174174
# Project / Run agents
175175
# ---
176+
176177
QUESTION ?= "KABUTOの起動時に、画面全体が紫色に点滅し、システムがフリーズします。KABUTO のマニュアルから、関連する情報を取得したり過去のシステムのトラブルシュート事例が蓄積されたデータベースから、関連する情報を取得して質問に答えてください"
178+
PORT ?= 5001
179+
177180
.PHONY: run-chat-with-tools-agent
178181
run-chat-with-tools-agent: ## run chat with tools agent
179182
uv run python scripts/agent_operator.py run \
@@ -227,4 +230,4 @@ mlflow: ## run MLflow
227230
uv run mlflow server \
228231
--backend-store-uri sqlite:///mlflow.db \
229232
--host 0.0.0.0 \
230-
--port 5000
233+
--port $(PORT)

docs/references.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,14 @@
9090
- [DSPy (Declarative Self-improving Python)](https://dspy.ai/)
9191
- [Language Models](https://dspy.ai/learn/programming/language_models/)
9292
- [Language Models / v3.0.3](https://github.com/stanfordnlp/dspy/blob/3.0.3/docs/docs/learn/programming/language_models.md)
93-
- [Software Design誌「実践LLMアプリケーション開発」第25回サンプルコード](https://github.com/mahm/softwaredesign-llm-application/tree/main/25)
93+
- [Software Design 誌「実践 LLM アプリケーション開発」第 25 回サンプルコード](https://github.com/mahm/softwaredesign-llm-application/tree/main/25)
94+
95+
### [MLflow](https://mlflow.org/docs/latest/genai/)
96+
97+
- [LangChain / MLflow](https://docs.langchain.com/oss/python/integrations/providers/mlflow_tracking)
98+
- [MLflow / Tracing LangGraph🦜🕸️](https://mlflow.org/docs/latest/genai/tracing/integrations/listing/langgraph/)
99+
- [GenAI Evaluation Quickstart](https://mlflow.org/docs/latest/genai/eval-monitor/quickstart/)
100+
101+
### [LiteLLM](https://docs.litellm.ai/)
102+
103+
- [Azure OpenAI](https://docs.litellm.ai/docs/providers/azure/)

scripts/dspy_operator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def optimize_with_miprov2(trainset, eval_lm, chat_lm):
7676
"""MIPROv2を使用してチャットボットを最適化"""
7777

7878
# MLflowの設定
79-
MLFLOW_PORT = os.getenv("MLFLOW_PORT", "5000")
79+
MLFLOW_PORT = os.getenv("MLFLOW_PORT", "5001")
8080
MLFLOW_TRACKING_URI = f"http://localhost:{MLFLOW_PORT}"
8181
MLFLOW_EXPERIMENT_NAME = "DSPy-EdamameFairy-Optimization"
8282
MLFLOW_RUN_NAME = "miprov2_optimization"

scripts/mlflow_operator.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import logging
2+
import os
3+
from logging import basicConfig
4+
5+
import mlflow
6+
import typer
7+
from dotenv import load_dotenv
8+
from langchain_core.messages import HumanMessage, SystemMessage
9+
from mlflow.genai import scorer
10+
from mlflow.genai.scorers import Correctness, Guidelines
11+
12+
from template_langgraph.agents.demo_agents.weather_agent import graph
13+
from template_langgraph.llms.azure_openais import AzureOpenAiWrapper, Settings
14+
from template_langgraph.loggers import get_logger
15+
16+
# Typer application that groups the MLflow demo sub-commands (tracing / evaluate).
app = typer.Typer(help="MLflow operator CLI", add_completion=False)

# Module-level logger shared by every command in this script.
logger = get_logger(__name__)
21+
22+
23+
def set_verbose_logging(verbose: bool):
    """Raise the module logger (and root logging config) to DEBUG when requested."""
    if not verbose:
        return
    logger.setLevel(logging.DEBUG)
    basicConfig(level=logging.DEBUG)
27+
28+
29+
@app.command(
    help="Run the LangGraph agent with MLflow tracing ref. https://mlflow.org/docs/2.21.3/tracing/integrations/langgraph"
)
def tracing(
    query: str = typer.Option(
        "What is the weather like in Japan?",
        "--query",
        "-q",
        help="Query to run with the LangGraph agent",
    ),
    experiment_name: str = typer.Option(
        "LangGraph Experiment",
        "--experiment-name",
        "-e",
        help="MLflow experiment name",
    ),
    tracking_uri: str = typer.Option(
        "http://localhost:5001",
        "--tracking-uri",
        "-t",
        help="MLflow tracking URI",
    ),
    verbose: bool = typer.Option(
        True,
        "--verbose",
        "-v",
        help="Enable verbose output",
    ),
):
    """Invoke the weather agent once and log the MLflow trace it produced."""
    set_verbose_logging(verbose)
    logger.info("Running...")

    # Point LangChain auto-instrumentation at the requested MLflow server/experiment.
    mlflow.langchain.autolog()
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    initial_state = {"messages": [HumanMessage(content=query)]}
    outcome = graph.invoke(initial_state)
    logger.info(f"Result: {outcome}")

    # Fetch the trace recorded for the invocation above and surface its token usage.
    last_trace = mlflow.get_trace(trace_id=mlflow.get_last_active_trace_id())
    logger.info(f"Trace info: {last_trace.info.token_usage}")
79+
80+
81+
@app.command(
    help="Evaluate the LangGraph agent with MLflow tracing ref. https://mlflow.org/docs/latest/genai/eval-monitor/quickstart/"
)
def evaluate(
    experiment_name: str = typer.Option(
        "LangGraph Experiment",
        "--experiment-name",
        "-e",
        help="MLflow experiment name",
    ),
    tracking_uri: str = typer.Option(
        "http://localhost:5001",
        "--tracking-uri",
        "-t",
        help="MLflow tracking URI",
    ),
    verbose: bool = typer.Option(
        True,
        "--verbose",
        "-v",
        help="Enable verbose output",
    ),
):
    """Run an MLflow GenAI evaluation of a simple Q&A predictor.

    Builds a three-example Q&A dataset, answers each question with the
    Azure OpenAI chat model, and scores the answers with two LLM judges
    (Correctness, an "is_english" guideline) plus a heuristic conciseness
    scorer. Results are logged to the given MLflow experiment.
    """
    set_verbose_logging(verbose)
    logger.info("Running...")

    # Point LangChain auto-instrumentation at the requested MLflow server/experiment.
    mlflow.langchain.autolog()
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment_name)

    llm = AzureOpenAiWrapper().chat_model

    def qa_predict_fn(question: str) -> str:
        """Simple Q&A prediction function using OpenAI"""
        response = llm.invoke(
            [
                SystemMessage(content="You are a helpful assistant. Answer questions concisely."),
                HumanMessage(content=question),
            ]
        )
        # str() covers both plain-string and structured (list) message content.
        return str(response.content)

    @scorer
    def is_concise(outputs: str) -> bool:
        """Evaluate if the answer is concise (at most 5 words)"""
        return len(outputs.split()) <= 5

    # To configure LiteLLM for Azure OpenAI ref. https://docs.litellm.ai/docs/providers/azure/
    settings = Settings()

    os.environ["AZURE_API_KEY"] = settings.azure_openai_api_key
    os.environ["AZURE_API_BASE"] = settings.azure_openai_endpoint
    os.environ["AZURE_API_VERSION"] = settings.azure_openai_api_version
    os.environ["AZURE_API_TYPE"] = "azure"

    # MLflow judge-model URI routed through LiteLLM's Azure provider.
    model = f"azure:/{settings.azure_openai_model_chat}"
    results = mlflow.genai.evaluate(
        data=[
            {
                "inputs": {"question": "What is the capital of France?"},
                "expectations": {"expected_response": "Paris"},
            },
            {
                "inputs": {"question": "Who was the first person to build an airplane?"},
                "expectations": {"expected_response": "Wright Brothers"},
            },
            {
                "inputs": {"question": "Who wrote Romeo and Juliet?"},
                "expectations": {"expected_response": "William Shakespeare"},
            },
        ],
        predict_fn=qa_predict_fn,
        scorers=[
            Correctness(model=model),
            Guidelines(
                model=model,
                name="is_english",
                guidelines="The answer must be in English",
            ),
            is_concise,
        ],
    )
    logger.info(f"Evaluation results: {results}")
164+
165+
166+
if __name__ == "__main__":
    # Load .env values (overriding any existing env vars) before dispatching the CLI.
    load_dotenv(verbose=True, override=True)
    app()

0 commit comments

Comments
 (0)