
Commit bc35383

[CI] add rl case (#1482)

* add rl cases
* run cases
* run cases
* ready to PR
* remove push

1 parent: 72bac53

File tree: 4 files changed (+178, −27 lines)

- autotest/cluster/clusterx.py
- autotest/config.yaml
- autotest/module/train.py
- autotest/utils/check_metric.py

autotest/cluster/clusterx.py (8 additions, 1 deletion)

```diff
@@ -58,11 +58,18 @@ def execute_task(self, task_config: Dict[str, Any]):
             raise RuntimeError(f"clusterx job {job_name} start fail, task config is {task_config}, exception is: {e}")
 
         start_time = time.time()
+        run_start_time = None
 
         while True:
             status = self.get_task_status(job_schema.job_id)
+            if status in [JobStatus.RUNNING] and run_start_time is None:
+                run_start_time = time.time()
             if status in [JobStatus.SUCCEEDED]:
-                return True, "Task succeeded"
+                run_time = time.time() - run_start_time
+                if run_time >= timeout:
+                    return False, f"Task succeeded, but run time is {run_time}, exceeding timeout {timeout}"
+                else:
+                    return True, "Task succeeded"
             elif status in [JobStatus.FAILED, JobStatus.STOPPED]:
                 if status in [JobStatus.FAILED]:
                     time.sleep(10)
```
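This change separates queue wait from execution time: the run-time clock starts only when the job is first observed in `RUNNING`, so time spent pending in the cluster queue no longer counts against `timeout`. Below is a minimal, self-contained sketch of that polling pattern; `fake_status`, the status strings, and the timings are invented stand-ins for ClusterX's `get_task_status` and `JobStatus`, and the sketch adds a guard for jobs that finish before `RUNNING` is ever observed (a corner case the committed code does not cover).

```python
import time

# Invented stand-in for get_task_status(): two polls of queueing,
# then RUNNING twice, then SUCCEEDED.
_statuses = iter(["QUEUED", "QUEUED", "RUNNING", "RUNNING", "SUCCEEDED"])

def fake_status():
    return next(_statuses)

def wait_for_job(timeout=60.0, poll_interval=0.01):
    run_start_time = None  # set the first time the job is seen RUNNING
    while True:
        status = fake_status()
        if status == "RUNNING" and run_start_time is None:
            run_start_time = time.time()  # queue wait is excluded from here on
        if status == "SUCCEEDED":
            # Guard for jobs that finish between polls without RUNNING being seen.
            started = run_start_time if run_start_time is not None else time.time()
            run_time = time.time() - started
            if run_time >= timeout:
                return False, f"Task succeeded, but run time {run_time:.2f}s exceeds timeout {timeout}s"
            return True, "Task succeeded"
        if status in ("FAILED", "STOPPED"):
            return False, f"Task ended with status {status}"
        time.sleep(poll_interval)

print(wait_for_job())  # (True, 'Task succeeded')
```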

autotest/config.yaml (43 additions, 0 deletions)

```diff
@@ -208,3 +208,46 @@ case:
           runtime_info/tgs: 0.05
           runtime_info/text_tokens: 0
       timeout: 10800
+
+  qwen3-rl-lmdeploy:
+    -
+      type: rl
+      parameters:
+        config: autotest/config/rl_qwen3_gsk8k_grpo.py
+        infer_backend: lmdeploy
+        output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
+      resource:
+        envs:
+          - MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3-8B
+          - DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/train-mini.jsonl
+          - EVAL_DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/gsm8k/test.jsonl
+          - XTUNER_DETERMINISTIC=true
+      assert_info:
+        base_metric: qwen3-rl-lmdeploy/20260203/tracker.jsonl
+        check_metrics:
+          -
+            metric: eval/accuracy
+            threshold: 0.05
+            method: absolute
+            operator: <
+          -
+            metric: response/rewards/mean
+            threshold: 0.1
+            method: absolute
+            operator: <
+          -
+            metric: mismatch/mismatch_k3_kl
+            threshold: 0.0001
+            method: absolute
+            operator: <=
+          -
+            metric: response/response_len/mean
+            threshold: 0.12
+            method: relative
+            operator: <
+          -
+            metric: time/step
+            threshold: 10
+            method: absolute
+            operator: <
+      timeout: 2460
```
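Each `check_metrics` entry pins one tracked metric against the baseline run named in `base_metric`: `method` selects absolute or baseline-normalized error, and `operator` selects strict or non-strict comparison against `threshold`. Here is a small illustration of how one entry is evaluated; the `entry` values are taken from the case above, but the metric values are made up:

```python
# One check_metrics entry from the qwen3-rl-lmdeploy case above.
entry = {"metric": "response/response_len/mean", "threshold": 0.12,
         "method": "relative", "operator": "<"}

base_val, cur_val = 512.0, 550.0  # invented baseline vs. current step values

if entry["method"] == "absolute":
    error = abs(cur_val - base_val)
else:  # "relative": normalize the error by the baseline magnitude
    error = abs(cur_val - base_val) / abs(base_val)

# Operator "<" demands strict inequality; "<=" would also accept equality.
passed = error < entry["threshold"] if entry["operator"] == "<" else error <= entry["threshold"]
print(f"error={error:.4f}, passed={passed}")  # error=0.0742, passed=True
```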

autotest/module/train.py (43 additions, 25 deletions)

```diff
@@ -1,6 +1,6 @@
 import os
 
-from utils.check_metric import check_result
+from utils.check_metric import check_result, check_rl_result
 from utils.run_cmd import run_cmd
 
 
@@ -25,28 +25,37 @@ def get_cmd(config):
         ]
     )
 
-    command = (
-        f"cd {current_dir}; pwd; pip install -e .[all]; pip install more-itertools; export GITHUB_RUN_ID={config.get('run_id')}; "
-        + f"torchrun --nproc-per-node {nproc_per_node} --master_addr=${{MASTER_ADDR}} --master_port=${{MASTER_PORT}} --nnodes=${{WORLD_SIZE}} --node_rank=${{RANK}} "
-        + f"xtuner/v1/train/cli/{train_type}.py"
-    )
-    if config_path:
-        output_path = model_config = config.get("parameters", {}).get("output_path", ".")
-        if output_path == ".":
-            command += f" --config {config_path}; mkdir -p {work_dir}; mv {output_path}/.xtuner {work_dir}; mv {output_path}/202* {work_dir}"
+    if train_type == "sft":
+        command = (
+            f"cd {current_dir}; pwd; pip install -e .[all]; pip install more-itertools; export GITHUB_RUN_ID={config.get('run_id')}; "
+            + f"torchrun --nproc-per-node {nproc_per_node} --master_addr=${{MASTER_ADDR}} --master_port=${{MASTER_PORT}} --nnodes=${{WORLD_SIZE}} --node_rank=${{RANK}} "
+            + f"xtuner/v1/train/cli/{train_type}.py"
+        )
+        if config_path:
+            output_path = model_config = config.get("parameters", {}).get("output_path", ".")
+            if output_path == ".":
+                command += f" --config {config_path}; mkdir -p {work_dir}; mv {output_path}/.xtuner {work_dir}; mv {output_path}/202* {work_dir}"
+            else:
+                command += f" --config {config_path}"
         else:
-            command += f" --config {config_path}"
-    else:
-        if model_config:
-            command += f" --model-cfg {model_config}"
-        if chat_template:
-            command += f" --chat_template {chat_template}"
-        if dataset_path:
-            command += f" --dataset {dataset_path}"
-        command += f" --work_dir {work_dir}"
+            if model_config:
+                command += f" --model-cfg {model_config}"
+            if chat_template:
+                command += f" --chat_template {chat_template}"
+            if dataset_path:
+                command += f" --dataset {dataset_path}"
+            command += f" --work_dir {work_dir}"
 
-    config["work_dir"] = work_dir
-    return command, config
+        config["work_dir"] = work_dir
+        return command, config
+    elif train_type == "rl":
+        infer_type = config.get("parameters", {}).get("infer_backend", "lmdeploy")
+        config["work_dir"] = work_dir
+        command = (
+            f"cd {current_dir}; pwd; pip install -e .[all]; export GITHUB_RUN_ID={config.get('run_id')}; export WORK_DIR={work_dir}; "
+            + f"bash -x examples/v1/scripts/run_rl.sh {config_path} {infer_type} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}"
+        )
+        return command, config
     else:
         return "", config
```
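For `type: rl`, `get_cmd` no longer launches `torchrun` directly; it delegates to `examples/v1/scripts/run_rl.sh`, passing the case config, the inference backend, and the `MODEL_PATH`/`DATA_PATH`/`EVAL_DATA_PATH` environment variables supplied via the case's `resource.envs`. Roughly, for the `qwen3-rl-lmdeploy` case this assembles a command like the sketch below; `current_dir` and `work_dir` are invented placeholder paths:

```python
# Approximation of the rl branch of get_cmd() for the new case; paths invented.
config = {
    "type": "rl",
    "run_id": "12345",
    "parameters": {
        "config": "autotest/config/rl_qwen3_gsk8k_grpo.py",
        "infer_backend": "lmdeploy",
    },
}
current_dir, work_dir = "/workspace/xtuner", "/tmp/work_dir"  # placeholders
config_path = config["parameters"]["config"]
infer_type = config["parameters"].get("infer_backend", "lmdeploy")

command = (
    f"cd {current_dir}; pwd; pip install -e .[all]; export GITHUB_RUN_ID={config['run_id']}; export WORK_DIR={work_dir}; "
    + f"bash -x examples/v1/scripts/run_rl.sh {config_path} {infer_type} ${{MODEL_PATH}} ${{DATA_PATH}} ${{EVAL_DATA_PATH}}"
)
print(command)
# cd /workspace/xtuner; pwd; pip install -e .[all]; export GITHUB_RUN_ID=12345;
# export WORK_DIR=/tmp/work_dir; bash -x examples/v1/scripts/run_rl.sh
# autotest/config/rl_qwen3_gsk8k_grpo.py lmdeploy ${MODEL_PATH} ${DATA_PATH} ${EVAL_DATA_PATH}
```

The `validate()` and `get_latest_subdir()` changes in the same file follow.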
```diff
@@ -55,9 +64,18 @@ def validate(config):
     base_path = os.path.join(
         config.get("base_path").get("base_baseline_path"), config.get("assert_info", {}).get("base_metric", None)
     )
-    cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/rank0/tracker.jsonl")
-    check_metrics = config.get("assert_info", {}).get("check_metrics", {})
-    return check_result(config["case_name"], base_path, cur_path, check_metrics)
+    train_type = config.get("type")
+    if train_type == 'sft':
+        cur_path = os.path.join(get_latest_subdir(work_dir), "logs/exp_tracking/rank0/tracker.jsonl")
+        check_metrics = config.get("assert_info", {}).get("check_metrics", {})
+        return check_result(config["case_name"], base_path, cur_path, check_metrics)
+    elif train_type == 'rl':
+        cur_path = os.path.join(get_latest_subdir(work_dir), "exp_tracking/tracker.jsonl")
+        check_metrics = config.get("assert_info", {})
+        return check_rl_result(config["case_name"], base_path, cur_path, check_metrics)
+    else:
+        print(f"Unknown type: {train_type}")
+        return False
 
 def pre_action(config=None):
     action_info = config.get("pre_action", None)
@@ -71,7 +89,7 @@ def post_action(config=None):
 
 
 def get_latest_subdir(work_dir):
-    dirs = [d for d in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, d))]
+    dirs = [d for d in os.listdir(work_dir) if os.path.isdir(os.path.join(work_dir, d)) and len(d) == 14 and d.isdigit()]
 
     if not dirs:
         return None
```
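`get_latest_subdir` previously picked up any subdirectory, including non-run folders; the tightened filter keeps only 14-digit names, i.e. `YYYYMMDDHHMMSS` run timestamps, for which lexicographic order matches chronological order. A quick sketch with invented directory names (the latest-run selection via `max()` is an assumption about the part of the function the hunk does not show):

```python
# Invented work-dir contents: two timestamped runs plus unrelated folders.
names = ["20260203123456", "20260204091011", ".xtuner", "logs", "ckpt-100"]

# Keep only 14-digit, all-numeric names (YYYYMMDDHHMMSS run timestamps).
dirs = [d for d in names if len(d) == 14 and d.isdigit()]

# Zero-padded timestamps sort chronologically, so max() is the latest run.
print(max(dirs))  # 20260204091011
```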

autotest/utils/check_metric.py (84 additions, 1 deletion)

```diff
@@ -20,7 +20,8 @@ def extract_value(file, metrics):
         for line in f:
             line = json.loads(line)
             for metric in metrics:
-                metric_all[metric].append(line[metric])
+                if metric in line:
+                    metric_all[metric].append(line[metric])
             total_step += 1
 
     return total_step, metric_all
```
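The new guard matters for RL trackers because not every logged line carries every key; `eval/accuracy`, for instance, would plausibly appear only on eval steps, so the old `line[metric]` lookup would raise `KeyError`. A self-contained illustration with invented tracker lines:

```python
import json
from collections import defaultdict

# Invented tracker.jsonl lines: only the second one carries eval/accuracy.
lines = [
    '{"time/step": 8.1, "response/rewards/mean": 0.32}',
    '{"time/step": 7.9, "response/rewards/mean": 0.35, "eval/accuracy": 0.61}',
]

metric_all = defaultdict(list)
for raw in lines:
    record = json.loads(raw)
    for metric in ["time/step", "eval/accuracy"]:
        if metric in record:  # without this, the first line raises KeyError
            metric_all[metric].append(record[metric])

print(dict(metric_all))  # {'time/step': [8.1, 7.9], 'eval/accuracy': [0.61]}
```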
```diff
@@ -154,5 +155,87 @@ def check_result(case_name, base_path, cur_path, check_metric):
     result = not fail_metric
     return result, f"Some metric check failed,{fail_metric}"
 
+def check_rl_result(case_name, base_path, cur_path, assert_info):
+    fail_metric = {}
+    check_metrics_list = assert_info["check_metrics"]
+
+    metric_list = [item["metric"] for item in check_metrics_list]
+
+    base_steps, base_metrics = extract_value(base_path, metric_list)
+    cur_steps, cur_metrics = extract_value(cur_path, metric_list)
+
+    assert (
+        cur_steps == base_steps
+    ), f"current steps is not equal to base steps, current steps: {cur_steps}, base steps: {base_steps}"
+
+    output_path = Path(f"../{os.environ.get('GITHUB_RUN_ID','0')}")
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    check_metric_dict = {item["metric"]: item["threshold"] for item in check_metrics_list}
+    plot_all(case_name, check_metric_dict, base_metrics, cur_metrics, output_path)
+
+    shutil.copytree(output_path, f"./{os.environ['GITHUB_RUN_ID']}", dirs_exist_ok=True)
+    write_to_summary(case_name, base_path, cur_path)
+
+    for config in check_metrics_list:
+        metric = config["metric"]
+        threshold = config["threshold"]
+        method = config["method"]  # 'absolute' or 'relative'
+        operator = config["operator"]  # '<' or '<='
+
+        max_error = 0.0
+        max_error_idx = 0
+        check_flag = True
+
+        for idx, (base_val, cur_val) in enumerate(
+            zip(base_metrics[metric], cur_metrics[metric])
+        ):
+            if method == "absolute":
+                error = round(abs(cur_val - base_val), 5)
+            elif method == "relative":
+                if abs(base_val) < 1e-10:
+                    error = float("inf") if abs(cur_val) > 1e-10 else 0.0
+                else:
+                    error = round(abs(cur_val - base_val) / abs(base_val), 5)
+            else:
+                raise ValueError(f"Unknown method: {method}")
+
+            if error > max_error:
+                max_error = error
+                max_error_idx = idx
+
+            if operator == "<":
+                if not (error < threshold):
+                    fail_metric[metric] = (
+                        f"{metric} error {error:.6f} not less than threshold {threshold} "
+                        f"(method: {method}, operator: {operator}) at step {idx}, "
+                        f"baseline: {base_val:.6f}, current: {cur_val:.6f}"
+                    )
+                    check_flag = False
+                    break
+            elif operator == "<=":
+                if not (error <= threshold):
+                    fail_metric[metric] = (
+                        f"{metric} error {error:.6f} not less than or equal to threshold {threshold} "
+                        f"(method: {method}, operator: {operator}) at step {idx}, "
+                        f"baseline: {base_val:.6f}, current: {cur_val:.6f}"
+                    )
+                    check_flag = False
+                    break
+            else:
+                raise ValueError(f"Unknown operator: {operator}")
+
+        if check_flag:
+            logger.info(
+                f"✓ {metric} check passed, max error is {max_error:.6f} at step {max_error_idx} "
+                f"(method: {method}, operator: {operator})"
+            )
+
+    result = not bool(fail_metric)
+    if result:
+        return result, "All metrics check passed."
+    else:
+        return result, f"Some metric check failed: {fail_metric}"
+
 if __name__ == "__main__":
     print(check_result("qwen3-sft", "./base/tracker.jsonl", "./current/tracker.jsonl",{"grad_norm":0.000001,"loss/reduced_llm_loss":0.000001,"lr":0,"memory/max_memory_GB":0.2,"runtime_info/tgs":0.05,"runtime_info/text_tokens":0}))
```
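`check_rl_result` compares the baseline and current traces step by step, tracks the worst error per metric, and fails fast on the first step that violates the threshold. The standalone walk-through below reproduces just that comparison loop for the `mismatch/mismatch_k3_kl` entry from the config; the step values are invented and the plotting/summary side effects are omitted:

```python
# Per-metric comparison loop from check_rl_result, in isolation.
base = {"mismatch/mismatch_k3_kl": [0.0, 1e-5, 2e-5]}    # invented baseline trace
cur  = {"mismatch/mismatch_k3_kl": [0.0, 1e-5, 1.4e-4]}  # invented current trace

metric, threshold, operator = "mismatch/mismatch_k3_kl", 0.0001, "<="
fail = None
for idx, (b, c) in enumerate(zip(base[metric], cur[metric])):
    error = round(abs(c - b), 5)  # method: absolute
    ok = error <= threshold if operator == "<=" else error < threshold
    if not ok:
        fail = f"{metric} error {error} exceeds {threshold} at step {idx}"
        break  # fail fast on the first violating step

print(fail)  # mismatch/mismatch_k3_kl error 0.00012 exceeds 0.0001 at step 2
```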
