Remove hard-coded LLM-as-a-judge API base URL default and dead commented-out reward code

kcz358 · kcz358 · commit ac57c7a5fcbe · 2025-11-25T06:07:24.000Z
diff --git a/custom_rewards/vl_agent.py b/custom_rewards/vl_agent.py
@@ -26,7 +26,7 @@
 
 openai_api_key = "EMPTY"
 openai_api_base_list = [
-    os.environ.get("LLM_AS_A_JUDGE_BASE", "https://sd285v869b9467c7sab70.apigateway-cn-shanghai.volceapi.com/v1"),
+    os.environ.get("LLM_AS_A_JUDGE_BASE", "YOUR_API_BASE"), # e.g. http://localhost:8000/v1
 ]
 
 client_list = []
@@ -453,94 +453,6 @@ def compute_score(predict_str: str, ground_truth: str, extra_info=None, **kwargs
     else:
         return (0.8 * acc_reward + 0.2 * format_reward, acc_reward, format_reward)
 
-
-# def rule_math_verify(ground_truth, model_answer):
-#     gold = parse(ground_truth)
-#     answer = parse(model_answer)
-#     return verify(gold, answer)
-
-
-# def generative_verify(query, ground_truth, model_answer):
-#     client_idx = random.randint(0, len(client_list) - 1)
-#     client = client_list[client_idx]
-#     model_name = model_name_list[client_idx]
-
-#     full_prompt = MATH_VERIFY_PROMPT.format(
-#         query=query,
-#         gold_ans=ground_truth,
-#         pred_ans=model_answer,
-#     )
-
-#     response = ""
-#     for it in range(8):
-#         try:
-#             chat_response = client.chat.completions.create(
-#                 model=model_name,
-#                 messages=[
-#                     {"role": "user", "content": full_prompt},
-#                 ],
-#                 seed=random.randint(0, 1000000),
-#                 temperature=0.0,
-#             )
-#             response = chat_response.choices[0].message.content.strip()
-#             break
-#         except Exception as e:
-#             print(f" [ERROR math] generative_verify error: {e}")
-#             continue
-
-#     judgement = response.split("## Equivalence Judgement")[-1].lower()
-#     if "true" in judgement and "false" not in judgement:
-#         return True
-#     elif "false" in judgement and "true" not in judgement:
-#         return False
-#     else:
-#         print(" [ERROR math] verify bug output: ")
-
-
-# def compute_score_math(predict_str: str, ground_truth: str, extra_info=None) -> float:
-#     is_format_error = False
-#     # predict_str = "<think>" + predict_str
-#     count_think_1 = predict_str.count("<think>")
-#     count_think_2 = predict_str.count("</think>")
-#     if count_think_1 != count_think_2 or count_think_1 == 0:  # reward hacking
-#         is_format_error = True
-
-#     predict_no_think = predict_str.split("</think>")[-1].strip()
-#     count_answer_1 = predict_no_think.count("<answer>")
-#     count_answer_2 = predict_no_think.count("</answer>")
-#     if count_answer_1 != count_answer_2 or count_answer_1 == 0:
-#         is_format_error = True
-
-#     # extract answer content from answer tag
-#     if count_answer_1 == 0 or count_answer_2 == 0:
-#         answer_content = ""
-#     else:
-#         answer_content = predict_str.split("<answer>")[-1].split("</answer>")[0].strip()
-
-#     model_answer = ""
-#     if answer_content == "":
-#         acc_reward = 0.0
-#     else:
-#         answer_pattern = r"\\boxed{([^}]+)}"
-#         answer_list = re.findall(answer_pattern, answer_content, flags=re.DOTALL)
-#         if len(answer_list) == 0:
-#             acc_reward = 0.0
-#             is_format_error = True
-#         else:
-#             if len(answer_list) > 1:
-#                 is_format_error = True
-
-#             model_answer = answer_list[-1]
-#             if rule_math_verify(ground_truth, model_answer):
-#                 acc_reward = 1.0
-#             else:
-#                 acc_reward = 1.0 if generative_verify(extra_info["question"], ground_truth, model_answer) else 0.0
-
-#     format_reward = 0.0 if is_format_error else 1.0
-
-#     return 0.8 * acc_reward + 0.2 * format_reward, acc_reward, format_reward
-
-
 def compute_score_time_r1(predict_str: str, ground_truth: str, extra_info=None, use_recall=False) -> float:
     is_format_error = False
     # predict_str = "<think>" + predict_str
@@ -633,166 +545,3 @@ def compute_score_time_r1(predict_str: str, ground_truth: str, extra_info=None,
     format_reward = 0.0 if is_format_error else 1.0
 
     return 1.0 * acc_reward + 1.0 * format_reward, acc_reward, format_reward
-
-
-# def compute_score_videor1(predict_str: str, ground_truth: str, extra_info=None, **kwargs) -> float:
-#     """
-#     Video-R1 style reward computation with accuracy and format rewards.
-
-#     Args:
-#         predict_str: Model prediction string
-#         ground_truth: Ground truth answer
-#         extra_info: Dictionary containing additional info like 'question' and 'problem_type'
-#         **kwargs: Additional arguments like temporal, len_control settings
-
-#     Returns:
-#         Tuple of (total_reward, accuracy_reward, format_reward) or just total_reward
-#     """
-
-#     def extract_answer_videor1(text):
-#         """Extract answer from <answer></answer> tags"""
-#         pattern = r"<answer>\s*(.*?)\s*</answer>"
-#         match = re.search(pattern, text, re.DOTALL)
-#         if match:
-#             return match.group(1).strip()
-#         return ""
-
-#     def normalize_number(num_str):
-#         """Normalize number string to float"""
-#         try:
-#             num_str = num_str.replace(",", "")
-#             return float(num_str)
-#         except Exception as e:
-#             print(f"Error converting '{num_str}' to float: {e}")
-#             return None
-
-#     def wer(reference, hypothesis):
-#         """Word Error Rate calculation"""
-#         ref_words = reference.split()
-#         hyp_words = hypothesis.split()
-#         m = len(ref_words)
-#         n = len(hyp_words)
-#         d = [[0] * (n + 1) for _ in range(m + 1)]
-#         for i in range(m + 1):
-#             d[i][0] = i
-#         for j in range(n + 1):
-#             d[0][j] = j
-#         for i in range(1, m + 1):
-#             for j in range(1, n + 1):
-#                 if ref_words[i - 1] == hyp_words[j - 1]:
-#                     d[i][j] = d[i - 1][j - 1]
-#                 else:
-#                     d[i][j] = 1 + min(d[i - 1][j], d[i][j - 1], d[i - 1][j - 1])
-#         return d[m][n] / max(1, m)
-
-#     def compute_rouge_score(reference, hypothesis, use_stemmer=True):
-#         """Compute ROUGE score"""
-#         scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=use_stemmer)
-#         scores = scorer.score(reference, hypothesis)
-#         average_fmeasure = (scores["rouge1"].fmeasure + scores["rouge2"].fmeasure + scores["rougeL"].fmeasure) / 3
-#         return average_fmeasure
-
-#     # Initialize variables
-#     is_format_error = False
-
-#     # Format checking - Video-R1 style
-#     count_think_1 = predict_str.count("<think>")
-#     count_think_2 = predict_str.count("</think>")
-#     count_answer_1 = predict_str.count("<answer>")
-#     count_answer_2 = predict_str.count("</answer>")
-
-#     # Check format requirements
-#     if count_think_1 != count_think_2 or count_think_1 == 0:
-#         is_format_error = True
-#     if count_answer_1 != count_answer_2 or count_answer_1 == 0:
-#         is_format_error = True
-
-#     # Check basic format pattern
-#     pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
-#     if not re.search(pattern, predict_str, re.DOTALL):
-#         is_format_error = True
-
-#     # Extract answer
-#     if count_answer_1 == 0 or count_answer_2 == 0:
-#         answer_text = ""
-#     else:
-#         answer_text = extract_answer_videor1(predict_str)
-
-#     # Compute accuracy reward
-#     # If there's a format error, set accuracy reward to 0 regardless of answer content
-#     if answer_text == "" or is_format_error:
-#         acc_reward = 0.0
-#     else:
-#         try:
-#             # Get problem type from extra_info
-#             problem_type = extra_info.get("problem_type", "free-form") if extra_info else "free-form"
-
-#             output_ans = answer_text
-#             gt_ans = extract_answer_videor1(ground_truth) if "<answer>" in ground_truth else ground_truth
-
-#             if problem_type == "multiple choice":
-#                 acc_reward = 1.0 if output_ans.strip() == gt_ans.strip() else 0.0
-#             elif problem_type == "numerical":
-#                 gt_has_decimal = ("." in gt_ans) or ("," in gt_ans)
-#                 out_has_decimal = ("." in output_ans) or ("," in output_ans)
-#                 if gt_has_decimal != out_has_decimal:
-#                     acc_reward = 0.0
-#                 else:
-#                     gt_number = normalize_number(gt_ans)
-#                     out_number = normalize_number(output_ans)
-#                     if gt_number is None or out_number is None:
-#                         acc_reward = 0.0
-#                     else:
-#                         acc_reward = 1.0 if round(gt_number, 2) == round(out_number, 2) else 0.0
-#             elif problem_type == "OCR":
-#                 error_rate = wer(gt_ans, output_ans)
-#                 acc_reward = 1 - error_rate
-#                 acc_reward = max(0.0, min(1.0, acc_reward))
-#             elif problem_type == "free-form":
-#                 score = compute_rouge_score(gt_ans, output_ans)
-#                 acc_reward = max(0.0, min(1.0, score))
-#             elif problem_type == "regression":
-#                 gt_number = normalize_number(gt_ans)
-#                 out_number = normalize_number(output_ans)
-#                 if gt_number is None or out_number is None:
-#                     acc_reward = 0.0
-#                 else:
-#                     rel_diff = (abs(out_number - gt_number) + 1e-9) / (abs(gt_number) + 1e-9)
-#                     rel_diff = min(1.0, max(0.0, rel_diff))
-#                     acc_reward = 1 - rel_diff
-#             else:
-#                 # Unknown problem type - return 0 (same as original Video-R1)
-#                 acc_reward = 0.0
-
-#         except Exception as e:
-#             print(f"Error in Video-R1 accuracy reward computation: {e}")
-#             acc_reward = 0.0
-
-#     # Penalize for overly long answers
-#     if len(answer_text) >= 1000:
-#         acc_reward = 0.0
-#         is_format_error = True
-
-#     # Format reward
-#     format_reward = 0.0 if is_format_error else 1.0
-
-#     # Debug logging
-#     if os.getenv("DEBUG_MODE") == "true":
-#         log_path = os.getenv("LOG_PATH", "./videor1_debug.log")
-#         current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
-#         with open(log_path, "a", encoding="utf-8") as f:
-#             f.write(f"------------- {current_time} Video-R1 Reward -------------\n")
-#             f.write(f"Prediction: {predict_str}\n")
-#             f.write(f"Ground Truth: {ground_truth}\n")
-#             f.write(f"Extracted Answer: {answer_text}\n")
-#             f.write(f"Accuracy Reward: {acc_reward}\n")
-#             f.write(f"Format Reward: {format_reward}\n")
-#             f.write(f"Format Error: {is_format_error}\n")
-#             f.write("=" * 50 + "\n")
-
-#     return (acc_reward + format_reward, acc_reward, format_reward)
-
-
-# if __name__ == "__main__":
-#     # 测试新的Video-R1 reward函数
-#     test_compute_score_videor1()