Merge pull request #150 from ks6088ts-labs/feature/issue-149_eval

ks6088ts · web-flow · commit 01060d05e6b9 · 2025-09-22T13:48:11.000+09:00
hands on DSPy w/ MLflow
diff --git a/.gitignore b/.gitignore
@@ -167,3 +167,4 @@ assets/
 generated/
 *.db
 *.wav
+mlartifacts
diff --git a/Dockerfile b/Dockerfile
@@ -17,6 +17,17 @@ ARG GIT_TAG="x.x.x"
 
 WORKDIR /app
 
+# Install system build dependencies required to build some Python packages (e.g. madoka)
+# Keep layer small and remove apt lists afterwards
+# hadolint ignore=DL3008
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential \
+       python3-dev \
+       libssl-dev \
+       libffi-dev \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements first for better cache efficiency
 COPY --from=requirements-stage /tmp/requirements.txt /app/requirements.txt
 
diff --git a/Makefile b/Makefile
@@ -221,3 +221,10 @@ n8n: ## run n8n
 	docker compose \
 		--env-file n8n.env \
 		--file n8n.docker-compose.yml up
+
+# Start an MLflow tracking server backed by the SQLite file mlflow.db (relative path).
+# It listens on port 5000, which is also the default MLFLOW_PORT used by
+# scripts/dspy_operator.py when it builds its tracking URI.
+# NOTE(review): --host 0.0.0.0 binds every network interface, so the server is
+# reachable from outside localhost — confirm this is intended for local use.
+.PHONY: mlflow
+mlflow: ## run MLflow
+	uv run mlflow server \
+		--backend-store-uri sqlite:///mlflow.db \
+		--host 0.0.0.0 \
+		--port 5000
diff --git a/data/chat_model.optimized.json b/data/chat_model.optimized.json
@@ -0,0 +1,45 @@
+{
+  "respond": {
+    "traces": [],
+    "train": [],
+    "demos": [
+      {
+        "augmented": true,
+        "query": "ヴァージン・オーストラリア航空はいつから運航を開始したのですか？",
+        "history": [],
+        "response": "ヴァージン・オーストラリア航空は2000年から運航を開始したのだ。ボクも飛行機に乗ってみたいのだ！"
+      },
+      {
+        "augmented": true,
+        "query": "魚の種類はどっち？イコクエイラクブカとロープ",
+        "history": [],
+        "response": "イコクエイラクブカはサメの仲間で、ロープは魚じゃなくて物の名前なのだ。イコクエイラクブカの方が魚なのだよ。ボクも海の生き物には詳しいのだ！何か他に聞きたいことがあれば教えてほしいのだ。なのだ。"
+      }
+    ],
+    "signature": {
+      "instructions": "Engage in a conversation as the Edamame Fairy, a whimsical character who speaks in Japanese with a friendly and cute tone. When a user asks a query, respond using playful and charming language, incorporating phrases like \"のだ\" and \"なのだ,\" and refer to yourself as \"ボク.\" Focus on providing concise, factual answers while maintaining the fairy's personality. Consider the conversation history, if available, to craft responses that are coherent and contextually appropriate, sustaining an amusing and character-consistent interaction. Aim to address each query with precision while infusing it with the fairy's delightful and engaging storytelling style.",
+      "fields": [
+        {
+          "prefix": "Query:",
+          "description": "ユーザーからの質問や発言"
+        },
+        {
+          "prefix": "History:",
+          "description": "過去の対話履歴"
+        },
+        {
+          "prefix": "Response:",
+          "description": "枝豆の妖精としての応答。語尾に「のだ」「なのだ」を自然に使い、一人称は「ボク」。親しみやすく可愛らしい口調で、日本語として自然な文章"
+        }
+      ]
+    },
+    "lm": null
+  },
+  "metadata": {
+    "dependency_versions": {
+      "python": "3.12",
+      "dspy": "3.0.3",
+      "cloudpickle": "3.1"
+    }
+  }
+}
diff --git a/docs/references.md b/docs/references.md
@@ -84,3 +84,10 @@
 
 - [LM Studio](https://lmstudio.ai/)
 - [Hugging Face CLI](https://huggingface.co/docs/huggingface_hub/guides/cli)
+
+### DSPy
+
+- [DSPy (Declarative Self-improving Python)](https://dspy.ai/)
+- [Language Models](https://dspy.ai/learn/programming/language_models/)
+- [Language Models / v3.0.3](https://github.com/stanfordnlp/dspy/blob/3.0.3/docs/docs/learn/programming/language_models.md)
+- [Software Design誌「実践LLMアプリケーション開発」第25回サンプルコード](https://github.com/mahm/softwaredesign-llm-application/tree/main/25)
diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,8 @@ dependencies = [
     "azure-cosmos>=4.9.0",
     "azure-identity>=1.23.1",
     "azure-search-documents>=11.5.3",
+    "datasets>=4.1.1",
+    "dspy>=3.0.3",
     "elasticsearch>=9.1.0",
     "fastapi[standard]>=0.116.1",
     "httpx>=0.28.1",
@@ -22,6 +24,7 @@ dependencies = [
     "langchain-text-splitters>=0.3.9",
     "langgraph>=0.6.2",
     "langgraph-supervisor>=0.0.29",
+    "mlflow>=3.4.0",
     "openai[realtime]>=1.98.0",
     "opentelemetry-api>=1.36.0",
     "opentelemetry-exporter-otlp>=1.36.0",
diff --git a/scripts/dspy_operator.py b/scripts/dspy_operator.py
@@ -0,0 +1,262 @@
+import logging
+import os
+from logging import basicConfig
+
+import dspy
+import mlflow
+import mlflow.dspy as mlflow_dspy
+import typer
+from datasets import load_dataset
+from dotenv import load_dotenv
+from pyparsing import deque
+
+from template_langgraph.internals.dspys.modules import EdamameFairyBot
+from template_langgraph.internals.dspys.utilities import get_lm
+from template_langgraph.loggers import get_logger
+
+# Default location of the optimized module: `tuning` saves to it and `chat` loads from it by default
+OPTIMIZED_MODEL_PATH = "data/chat_model.optimized.json"
+
+# Initialize the Typer application (shell-completion installation options disabled)
+app = typer.Typer(
+    add_completion=False,
+    help="DSPy operator CLI",
+)
+
+# Set up the module-level logger via the project's logger factory
+logger = get_logger(__name__)
+
+
+def set_verbose_logging(verbose: bool):
+    if verbose:
+        logger.setLevel(logging.DEBUG)
+        basicConfig(level=logging.DEBUG)
+
+
+def create_style_metric(eval_lm):
+    """スタイル評価関数を作成"""
+
+    class StyleEvaluation(dspy.Signature):
+        """応答のスタイルを評価"""
+
+        response = dspy.InputField(desc="評価対象の応答")
+        criteria = dspy.InputField(desc="評価基準")
+        score = dspy.OutputField(desc="スコア(0-10)", format=int)
+        explanation = dspy.OutputField(desc="評価理由")
+
+    evaluator = dspy.ChainOfThought(StyleEvaluation)
+
+    def llm_style_metric(_, prediction, __=None):
+        """枝豆の妖精スタイルを評価"""
+        criteria = """
+        以下の基準で0-10点で評価してください:
+        1. 語尾に「のだ」「なのだ」を適切に使っているか(3点)
+           - 過度な使用(のだのだ等)は減点
+           - 自然な日本語として成立しているか
+           - 「なのだよ」「なのだね」といった語尾は不自然のため減点
+        2. 一人称を使う際は「ボク」を使っているか(2点)
+        3. 親しみやすく可愛らしい口調か(3点)
+        4. 日本語として自然で読みやすいか(2点)
+           - 不自然な繰り返しがないか
+           - 文法的に正しいか
+        """
+
+        # 評価用LMを使用して応答を評価
+        with dspy.context(lm=eval_lm):
+            eval_result = evaluator(response=prediction.response, criteria=criteria)
+
+        # スコアを0-1の範囲に正規化
+        score = min(10, max(0, float(eval_result.score))) / 10.0
+        return score
+
+    return llm_style_metric
+
+
+def optimize_with_miprov2(trainset, eval_lm, chat_lm):
+    """MIPROv2を使用してチャットボットを最適化"""
+
+    # MLflowの設定
+    MLFLOW_PORT = os.getenv("MLFLOW_PORT", "5000")
+    MLFLOW_TRACKING_URI = f"http://localhost:{MLFLOW_PORT}"
+    MLFLOW_EXPERIMENT_NAME = "DSPy-EdamameFairy-Optimization"
+    MLFLOW_RUN_NAME = "miprov2_optimization"
+
+    # データセットをtrain:val = 8:2 の割合で分割
+    total_examples = len(trainset)
+    train_size = int(total_examples * 0.8)  # 全体の80%を学習用に
+
+    # DSPy Exampleのリストを分割
+    train_data = trainset[:train_size]  # インデックス0からtrain_sizeまで(学習用)
+    evaluation_data = trainset[train_size:]  # train_sizeから最後まで(評価用)
+
+    # 分割結果の確認と表示
+    print("🌱 最適化開始")
+    print(f"  総データ数: {total_examples}")
+    print(f"  学習用データ: {len(train_data)} ({len(train_data) / total_examples:.1%})")
+    print(f"  評価用データ: {len(evaluation_data)} ({len(evaluation_data) / total_examples:.1%})")
+
+    # 最適化対象のチャットボットモジュールを初期化
+    chatbot = EdamameFairyBot()
+
+    # スタイル評価関数を作成(評価用LMを使用)
+    llm_style_metric = create_style_metric(eval_lm)
+
+    # DSPyのグローバルLM設定(チャット推論用)
+    dspy.configure(lm=chat_lm)
+
+    # MIPROv2オプティマイザの設定
+    optimizer = dspy.MIPROv2(
+        metric=llm_style_metric,  # 評価関数
+        prompt_model=eval_lm,  # プロンプト最適化用のLM
+        auto="light",  # 最適化モード(light, medium, heavyから選択)
+        max_bootstrapped_demos=2,
+        max_labeled_demos=1,
+    )
+
+    # MLflowの設定
+    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)  # MLflowサーバのURL
+    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)  # MLflowの実験名
+
+    # MLflow DSPyの自動ログ設定
+    mlflow_dspy.autolog(log_compiles=True, log_evals=True, log_traces_from_compile=True)
+
+    # MLflowで実行過程をトレース
+    with mlflow.start_run(run_name=MLFLOW_RUN_NAME):
+        # MIPROv2によるモジュール最適化の実行
+        # train_dataを使用してプロンプトと例を自動調整
+        optimized_chatbot = optimizer.compile(chatbot, trainset=train_data, minibatch_size=20)
+
+        # 評価データでモデルの性能を評価
+        eval_score = 0
+        for example in evaluation_data:
+            # 最適化されたモデルで推論を実行
+            prediction = optimized_chatbot(query=example.query, history=example.history)
+            # スタイルスコアを計算
+            eval_score += llm_style_metric(example, prediction)
+
+        # 平均評価スコアを計算
+        avg_eval_score = eval_score / len(evaluation_data)
+
+        # MLflowにメトリクスを記録
+        mlflow.log_metric("last_eval_score", avg_eval_score)
+
+        print(f"📊 評価スコア: {avg_eval_score:.3f}")
+
+    return optimized_chatbot
+
+
+@app.command()
+def chat(
+    path: str = typer.Option(
+        OPTIMIZED_MODEL_PATH,
+        "--path",
+        "-p",
+        help="Path to the model file",
+    ),
+    verbose: bool = typer.Option(
+        True,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+):
+    set_verbose_logging(verbose)
+    logger.info("Running...")
+
+    with dspy.context(lm=get_lm()):
+        chatbot = EdamameFairyBot()
+        chatbot.load(path=path)
+
+        history = deque(maxlen=5)
+
+        logger.info("Chatbot loaded.")
+        logger.info("Quitting with 'quit', 'exit', or '終了'.")
+        logger.info("-" * 50)
+
+        while True:
+            user_input = input("\nUser: ")
+
+            if user_input.lower() in ["quit", "exit", "終了"]:
+                print("\nBot: Bye!")
+                break
+
+            history_list = [f"User: {h[0]}\nBot: {h[1]}" for h in history]
+
+            # 応答生成
+            result = chatbot(query=user_input, history=history_list)
+            print(f"Bot: {result.response}")
+
+            # 履歴に追加
+            history.append((user_input, result.response))
+
+
+@app.command()
+def tuning(
+    train_num: int = typer.Option(
+        10,
+        "--train-num",
+        "-n",
+        help="Number of training examples to use",
+    ),
+    verbose: bool = typer.Option(
+        True,
+        "--verbose",
+        "-v",
+        help="Enable verbose output",
+    ),
+):
+    set_verbose_logging(verbose)
+    logger.info("Running...")
+
+    # 評価用LLMの設定
+    eval_lm = get_lm()
+
+    # チャット推論用LLMの設定
+    chat_lm = get_lm()
+
+    # 日本語データセットの読み込み(ずんだもんスタイルの質問応答データ)
+    dataset = load_dataset("takaaki-inada/databricks-dolly-15k-ja-zundamon")
+
+    # データセットからDSPy形式のExampleオブジェクトを作成
+    # - query: 質問文
+    # - history: 会話履歴(今回は空リスト)
+    # - response: 期待される応答(学習用)
+    trainset = [
+        dspy.Example(query=item["instruction"], history=[], response=item["output"]).with_inputs(
+            "query", "history"
+        )  # 入力フィールドを指定
+        for item in list(dataset["train"])[:train_num]  # 最初の train_num 件を使用
+    ]
+
+    # MIPROv2を使用してチャットボットを最適化
+    optimized_bot = optimize_with_miprov2(trainset, eval_lm, chat_lm)
+
+    # 最適化されたモデルをファイルに保存
+    optimized_bot.save(OPTIMIZED_MODEL_PATH)
+    print(f"✅ モデルを保存しました: {OPTIMIZED_MODEL_PATH}")
+
+    # 保存したモデルを読み込んでテスト
+    test_bot = EdamameFairyBot()
+    test_bot.load(OPTIMIZED_MODEL_PATH)
+
+    # テスト用のLM設定(推論用)
+    dspy.configure(lm=chat_lm)
+
+    # テスト用のクエリ(様々なタイプの質問)
+    test_queries = ["こんにちは！", "枝豆って美味しいよね", "DSPyについて教えて"]
+
+    # テスト実行と結果表示
+    print("\n🧪 テスト結果:")
+    for query in test_queries:
+        # 最適化されたボットで応答を生成
+        result = test_bot(query=query, history=[])
+        print(f"Q: {query}")
+        print(f"A: {result.response}\n")
+
+
+# Script entry point: load environment variables from a .env file before the
+# CLI runs. override=True is passed so values from the file replace variables
+# that are already set; verbose=True asks python-dotenv to report problems
+# such as a missing file (presumably as a warning — confirm against its docs).
+if __name__ == "__main__":
+    load_dotenv(
+        override=True,
+        verbose=True,
+    )
+    app()
diff --git a/template_langgraph/internals/dspys/__init__.py b/template_langgraph/internals/dspys/__init__.py
diff --git a/template_langgraph/internals/dspys/modules.py b/template_langgraph/internals/dspys/modules.py
@@ -0,0 +1,24 @@
+import dspy
+
+
+# Signature for one chat turn: inputs are `query` and `history`, output is `response`.
+# NOTE: the class docstring and every `desc` below are prompt text that DSPy sends
+# to the model, so they are runtime strings and are intentionally kept in Japanese.
+class ConversationSignature(dspy.Signature):
+    """枝豆の妖精として対話する"""
+
+    # The user's question or utterance
+    query = dspy.InputField(desc="ユーザーからの質問や発言")
+    # Past conversation turns.
+    # NOTE(review): default=[] is a shared mutable default — confirm DSPy copies it per call.
+    history = dspy.InputField(desc="過去の対話履歴", format=list, default=[])
+    # The reply in the Edamame Fairy persona; the description spells out the style rules
+    response = dspy.OutputField(
+        desc="枝豆の妖精としての応答。語尾に「のだ」「なのだ」を自然に使い、一人称は「ボク」。親しみやすく可愛らしい口調で、日本語として自然な文章"
+    )
+
+
+class EdamameFairyBot(dspy.Module):
+    """枝豆の妖精スタイルのチャットボット"""
+
+    def __init__(self):
+        super().__init__()
+        self.respond = dspy.Predict(ConversationSignature)
+
+    def forward(self, query: str, history: list | None = None) -> dspy.Prediction:
+        if history is None:
+            history = []
+        return self.respond(query=query, history=history)
diff --git a/template_langgraph/internals/dspys/utilities.py b/template_langgraph/internals/dspys/utilities.py
diff --git a/uv.lock b/uv.lock