
Commit 894858b

Non-verifiable Medicine QA Task (agentscope-ai#317)
1 parent 51c4287 commit 894858b

File tree: 7 files changed, +287 −1 lines changed
docs/sphinx_doc/assets/grpo_rubric_reward.png — 367 KB binary image (preview not shown)
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
# Non-Verifiable Medicine QA

This example shows how to use an LLM judge and rubrics to compute rewards for a non-verifiable medicine QA task. It is inspired by the [RaR-Implicit](https://arxiv.org/pdf/2507.17746) method.

Before running this example, please make sure you have prepared the environment and the dataset [anisha2102/RaR-Medicine](https://huggingface.co/datasets/anisha2102/RaR-Medicine).
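If you have not downloaded the dataset yet, the snippet below is a minimal sketch using the Hugging Face `datasets` library (assumed to be installed; it is not part of this example's files):

```python
# Minimal sketch: fetch and inspect the RaR-Medicine dataset.
# Assumes the Hugging Face `datasets` package is available in your environment.
from datasets import load_dataset

ds = load_dataset("anisha2102/RaR-Medicine", split="train")
print(len(ds))                 # roughly 20k QA pairs
sample = ds[0]
print(sample["question"])      # the medical question
print(sample["rubric"][0])     # each rubric entry has a title, description, and weight
```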
The RaR-Medicine dataset contains around 20k QA pairs with rubrics in the medicine domain. Unlike math scenarios, it is infeasible to obtain verifiable rewards for this dataset. Below is an example data sample:

```json
{
    "question": "What is the most sensitive imaging modality for diagnosing a ureteric stone in a patient presenting with acute renal colic?",
    "reference_answer": "The most sensitive imaging modality for diagnosing a ureteric stone in a patient presenting with acute renal colic is a non-contrast helical CT scan. This method is highly accurate, able to detect stones of varying sizes and compositions, and preferred due to its quick and reliable results without the need for contrast, making it the gold standard in such cases.",
    "rubric": [
        {
            "description": "Essential Criteria: Identifies non-contrast helical CT scan as the most sensitive modality for ureteric stones.",
            "title": "Identify Most Sensitive Modality",
            "weight": 5
        },
        ...
    ]
}
```
In the RaR-Implicit method, the LLM judge evaluates a group of responses against the provided rubrics and outputs a score in the range [0, 1] for each response. The higher the score, the better the response satisfies the rubrics.
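For intuition, here is a minimal, illustrative sketch of this scoring scheme. It is not the exact code shipped in this example (the real logic lives in `RubricJudgeWorkflow.get_judge_reward`), and `ask_judge` is a hypothetical stand-in for the auxiliary judge model call:

```python
# Illustrative sketch of rubric-based scoring for a group of responses.
from typing import Callable, Dict, List


def score_responses(
    question: str,
    responses: List[str],
    rubrics: List[Dict],
    ask_judge: Callable[[str], int],  # hypothetical judge call returning an integer rating in 1..10
) -> List[float]:
    # Render the weighted rubrics the way the judge prompt lists them.
    rubric_text = "\n".join(
        f"Rubric {i} (weight: {r['weight']}): {r['description']}" for i, r in enumerate(rubrics)
    )
    rewards = []
    for response in responses:
        judge_prompt = (
            f"<prompt>\n{question}\n</prompt>\n"
            f"<response>\n{response}\n</response>\n"
            f"<rubrics>\n{rubric_text}\n</rubrics>"
        )
        rating = ask_judge(judge_prompt)  # 1 (poor) .. 10 (excellent)
        rewards.append(rating * 0.1)      # normalize to the [0, 1] reward range
    return rewards
```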

The config file is located in [`rubric.yaml`](./rubric.yaml). To run this example, use the following command:
```bash
trinity run --config examples/grpo_rubric_as_reward/rubric.yaml
```

With the provided configuration, the reward increases over the training steps:

![reward](../../docs/sphinx_doc/assets/grpo_rubric_reward.png)
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
project: "Trinity-RFT-Example"
name: "MedicineQA_grpo_rubric_as_reward"
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
algorithm:
  algorithm_type: grpo
  advantage_fn_args:
    std_threshold: 0.0001  # effectively zero
  repeat_times: 8
model:
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-4B-Instruct-2507}
  max_response_tokens: 1024
  max_model_len: 10240
cluster:
  node_num: 1
  gpu_per_node: 8
buffer:
  total_epochs: 2
  batch_size: 96
  train_batch_size: 768  # 8*96
  explorer_input:
    taskset:
      name: rar_medicine
      storage_type: file
      path: anisha2102/RaR-Medicine
      split: train
      format:
        prompt_key: 'question'
        response_key: 'reference_answer'
      rollout_args:
        temperature: 1.0
      enable_progress_bar: false
    default_workflow_type: 'rubric_judge_workflow'
  trainer_input:
    experience_buffer:
      name: experience_buffer
      storage_type: queue
      use_priority_queue: true
explorer:
  eval_interval: 10
  max_timeout: 3600
  rollout_model:
    engine_num: 2
    tensor_parallel_size: 1
    enable_prefix_caching: false
    enforce_eager: true
    dtype: bfloat16
    seed: 42
  auxiliary_models:
    - model_path: Qwen/Qwen3-30B-A3B-Instruct-2507
      engine_num: 1
      tensor_parallel_size: 2
      enable_thinking: false
      max_prompt_tokens: 19456
      max_response_tokens: 1024
      max_model_len: 20480
synchronizer:
  sync_style: dynamic_by_explorer
  sync_method: 'nccl'
  sync_interval: 5
  sync_timeout: 3600
trainer:
  save_interval: 100
  trainer_config:
    actor_rollout_ref:
      model:
        use_remove_padding: true
      actor:
        use_dynamic_bsz: true
        ppo_max_token_len_per_gpu: 16384
        ulysses_sequence_parallel_size: 1
        optim:
          lr: 2e-6
      ref:
        log_prob_use_dynamic_bsz: ${trainer.trainer_config.actor_rollout_ref.actor.use_dynamic_bsz}
        log_prob_max_token_len_per_gpu: ${trainer.trainer_config.actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
        ulysses_sequence_parallel_size: ${trainer.trainer_config.actor_rollout_ref.actor.ulysses_sequence_parallel_size}  # sp size

trinity/common/models/vllm_model.py

Lines changed: 1 addition & 0 deletions
@@ -83,6 +83,7 @@ def __init__(
             gpu_memory_utilization=config.gpu_memory_utilization,
             enable_chunked_prefill=config.enable_chunked_prefill,
             # max_num_batched_tokens=256, # you can further set this parameter to reduce the vllm peak memory usage
+            disable_log_stats=True,
             enable_lora=config.enable_lora,
             **config.lora_kwargs,
         )

trinity/common/workflows/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,7 @@
 from trinity.common.workflows.math_trainable_ruler_workflow import (
     MathTrainableRULERWorkflow,
 )
+from trinity.common.workflows.rubric_judge_workflow import RubricJudgeWorkflow
 from trinity.common.workflows.simple_mm_workflow import (
     AsyncSimpleMMWorkflow,
     SimpleMMWorkflow,
@@ -90,4 +91,5 @@
     "MathTrainableRULERWorkflow",
     "AsyncSimpleMMWorkflow",
     "SimpleMMWorkflow",
+    "RubricJudgeWorkflow",
 ]
Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
"""A workflow with LLM-as-a-judge."""
import json
from typing import List, Optional, Tuple

import openai

from trinity.common.experience import Experience
from trinity.common.models.model import ModelWrapper
from trinity.common.workflows.workflow import WORKFLOWS, SimpleWorkflow, Task


@WORKFLOWS.register_module("rubric_judge_workflow")
class RubricJudgeWorkflow(SimpleWorkflow):
    """A workflow using LLM-as-a-judge and rubrics to get the reward.

    Adapted from https://arxiv.org/pdf/2507.17746
    """

    def __init__(
        self,
        *,
        task: Task,
        model: ModelWrapper,
        auxiliary_models: Optional[List[openai.OpenAI]] = None,
    ):
        super().__init__(
            task=task,
            model=model,
            auxiliary_models=auxiliary_models,
        )

    def reset(self, task: Task):
        """Modified from SimpleWorkflow.reset"""
        self.format_args = task.format_args
        self.system_prompt = task.format_args.system_prompt
        self.reply_prefix = task.format_args.reply_prefix

        if self.system_prompt is None:
            self.system_prompt = """A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.
"""

        self.raw_task = task.raw_task
        self.task_desc = task.task_desc
        self.truth = task.truth
        self.rubric = self.raw_task.get("rubric", [])

    def run(self) -> List[Experience]:
        """Modified from SimpleWorkflow.run"""

        messages = self.format_messages()

        self.logger.debug("start chat")
        responses = self.model.chat(messages, **self.rollout_args)

        # === Calculate rubric-based rewards ===
        assert (
            self.auxiliary_models is not None
        ), "Current implementation of rubric-based rewards requires that auxiliary_models is not None."

        judge_success_list = []
        for i, response in enumerate(responses):
            judge_success, reward = self.get_judge_reward(
                response=response.response_text, judger=self.auxiliary_models[0]
            )
            response.reward = reward
            response.eid.run = i + self.run_id_base

            judge_success_list.append(judge_success)

            if i == 0:
                self.logger.debug(
                    f"self.task_desc: {self.task_desc}, messages: {messages}, response: {response.response_text}, reward: {response.reward}"
                )

        # record judge success
        judge_success_rate = (
            sum(judge_success_list) / len(judge_success_list) if judge_success_list else 0.0
        )
        for response in responses:
            if response.metrics is None:
                response.metrics = {}
            response.metrics.update({"judge_success": float(judge_success_rate)})

        return responses

    def get_judge_reward(self, response: str, judger: openai.OpenAI) -> Tuple[bool, float]:
        """Get rewards with LLM-as-a-judge
        The prompts are adapted from RAR-IMPLICIT method in https://arxiv.org/pdf/2507.17746
        """
        # Step 1: format prompts
        # system prompt
        ruler_system_prompt = """You are an expert evaluator. Given a user prompt, a generated response, and a list of quality rubrics, please rate the overall quality of the response on a scale of 1 to 10 based on how well it satisfies the rubrics.
Consider all rubrics holistically when determining your score. A response that violates multiple rubrics should receive a lower score, while a response that satisfies all rubrics should receive a higher score.
Start your response with a valid JSON object that starts with "```json" and ends with "```". The JSON object should contain
a single key "rating" and the value should be an integer between 1 and 10.
Example response:
```json
{
    "rating": 7
}```"""
        # user prompt
        if len(self.rubric) > 0:
            rubric_prompt_parts = [
                f"Rubric {i} (weight: {single_rubric['weight']}): {single_rubric['description']}"
                for i, single_rubric in enumerate(self.rubric)
            ]
            rubric_list_string = "\n".join(rubric_prompt_parts)
        else:
            self.logger.warning("No rubric is provided!")
            rubric_list_string = "Rubrics are not provided."

        ruler_user_prompt = f"""Given the following prompt, response, and rubrics, please rate the overall quality of the response on a scale of 1 to 10 based
on how well it satisfies the rubrics.
<prompt>
{self.task_desc}
</prompt>
<response>
{response}
</response>
<rubrics>
{rubric_list_string}
</rubrics>
Your JSON Evaluation:
""".strip()

        # Step 2: invoke judger LLM
        messages = [
            {"role": "system", "content": ruler_system_prompt},
            {"role": "user", "content": ruler_user_prompt},
        ]
        completion = judger.chat.completions.create(
            model=judger.model_path, messages=messages, stream=False, temperature=0.0
        )
        judger_response = completion.choices[0].message.content
        self.logger.debug(f"LLM judge response: {judger_response}")

        # Step 3: extract score from judger's response (expecting a JSON block with "rating")
        try:
            # Extract content between ```json and ```
            start_tag = "```json"
            start_index = judger_response.find(start_tag)
            if start_index == -1:
                start_tag = "```"
                start_index = judger_response.find(start_tag)

            if start_index == -1:
                self.logger.warning("No JSON code block found in judger response.")
                return False, 0.0

            end_index = judger_response.find("```", start_index + len(start_tag))
            if end_index == -1:
                self.logger.warning("Malformed JSON code block in judger response.")
                return False, 0.0

            json_str = judger_response[start_index + len(start_tag) : end_index].strip()
            parsed = json.loads(json_str)
            rating = parsed.get("rating")

            if not isinstance(rating, (int, float)) or not (1 <= rating <= 10):
                self.logger.warning(f"Invalid or out-of-range rating: {rating}")
                return False, 0.0

            normalized_score = rating * 0.1  # Normalize 1-10 to 0-1 scale
            return True, normalized_score

        except json.JSONDecodeError as e:
            self.logger.warning(f"Failed to parse JSON from judger response: {e}")
            return False, 0.0
        except Exception as e:
            self.logger.warning(f"Unexpected error when processing judger response: {e}")
            return False, 0.0
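
As an aside (not part of the commit), the rating-extraction step above can be exercised standalone; the helper below is a simplified, illustrative mirror of the Step 3 parsing logic in `get_judge_reward`:

# Illustrative, simplified mirror of the Step 3 parsing logic in get_judge_reward.
import json
from typing import Tuple


def extract_reward(judger_response: str) -> Tuple[bool, float]:
    # Locate the fenced JSON block emitted by the judge, preferring a ```json fence.
    start_tag = "```json"
    start = judger_response.find(start_tag)
    if start == -1:
        start_tag = "```"
        start = judger_response.find(start_tag)
    if start == -1:
        return False, 0.0
    end = judger_response.find("```", start + len(start_tag))
    if end == -1:
        return False, 0.0
    try:
        rating = json.loads(judger_response[start + len(start_tag) : end].strip()).get("rating")
    except json.JSONDecodeError:
        return False, 0.0
    if not isinstance(rating, (int, float)) or not (1 <= rating <= 10):
        return False, 0.0
    return True, rating * 0.1  # normalize the 1-10 rating to [0, 1]


print(extract_reward('```json\n{"rating": 5}\n```'))  # -> (True, 0.5)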

trinity/trainer/verl_trainer.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@
 
 class CheckpointMonitor:
     def __init__(self, default_local_dir: str, default_hdfs_dir: str = None):
-        self.logger = get_logger("Checkpoint Monitor", in_ray_actor=True)
+        self.logger = get_logger("checkpoint_monitor", in_ray_actor=True)
         self.default_local_dir = default_local_dir
         self.default_hdfs_dir = default_hdfs_dir
         self.local_latest_checkpointed_iteration = os.path.join(
