diff --git a/environment.yml b/environment.yml
index 6c9b4896..63ad8526 100644
--- a/environment.yml
+++ b/environment.yml
@@ -69,6 +69,7 @@ dependencies:
     - pycocotools>=2.0.8
     - vllm>=0.8.5
     - latex2sympy2>=1.9.1
+    - bfcl-eval>=2025.10.13
     - parameterized>=0.9.0
     - pyseccomp>=0.1.2
 prefix: /home/sayouse/miniconda3/envs/myenv
\ No newline at end of file
diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
new file mode 100644
index 00000000..a8e9f8b1
--- /dev/null
+++ b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
@@ -0,0 +1,57 @@
+import re, ast
+from dataclasses import dataclass
+
+import pandas as pd
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import (
+    execute_multi_turn_func_call,
+)
+
+from .transform import DFTransformBase
+
+
+@dataclass
+class BFCLMultiturnExecuteCall(DFTransformBase):
+    model_output_column: str
+    model_answer_column: str
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        df[self.model_answer_column] = df.apply(self.execute_model_output, axis=1)
+        return df
+
+    @staticmethod
+    def execute_model_output(df_row):
+        """
+        Execute the function calls in the model output and return their results.
+
+        Parameters:
+            df_row: A dataframe row containing model_output, initial_config, involved_classes, and id.
+        Returns:
+            str: The space-joined execution results, or "No call executed" if no function calls were found.
+        """
+        test_entry = df_row
+        response_text = test_entry["model_output"]
+        try:
+            initial_config = ast.literal_eval(test_entry["initial_config"])
+        except (ValueError, SyntaxError) as e:
+            raise ValueError(f"Invalid initial_config format: {test_entry['initial_config']}") from e
+        try:
+            involved_classes: list = eval(test_entry["involved_classes"])
+        except (ValueError, SyntaxError) as e:
+            raise ValueError(f"Invalid involved_classes format: {test_entry['involved_classes']}") from e
+        test_entry_id: str = test_entry["id"]
+        test_category: str = test_entry_id.rsplit("_", 1)[0]  # e.g. "multi_turn_base_0" -> "multi_turn_base"
+
+        func_calls = re.findall(r"\w+\([^)]*\)", response_text)  # extract call expressions such as foo(arg=1)
+        if len(func_calls) == 0:
+            return "No call executed"
+
+        execution_results, involved_instances = execute_multi_turn_func_call(
+            func_call_list=func_calls,
+            initial_config=initial_config,
+            involved_classes=involved_classes,
+            model_name="",
+            test_entry_id=test_entry_id,
+            long_context=("long_context" in test_category or "composite" in test_category),
+            is_evaL_run=False,
+        )
+        return " ".join(execution_results)
diff --git a/eureka_ml_insights/metrics/bfcl_metrics.py b/eureka_ml_insights/metrics/bfcl_metrics.py
new file mode 100644
index 00000000..f76a359d
--- /dev/null
+++ b/eureka_ml_insights/metrics/bfcl_metrics.py
@@ -0,0 +1,57 @@
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import (
+    multi_turn_checker,
+)
+
+from eureka_ml_insights.metrics.metrics_base import ClassicMetric
+
+
+class BFCLMultiturnMatch(ClassicMetric):
+    """This metric checks whether the generated function calls match the ground truth."""
+
+    def __init__(
+        self,
+        model_output_col: str = "model_output",
+        ground_truth_col: str = "ground_truth",
+        initial_config_col: str = "initial_config",
+        involved_classes_col: str = "involved_classes",
+        test_entry_id_col: str = "id",
+    ):
+        super().__init__()
+        self.model_output_col = model_output_col
+        self.ground_truth_col = ground_truth_col
+        self.initial_config_col = initial_config_col
+        self.involved_classes_col = involved_classes_col
+        self.test_entry_id_col = test_entry_id_col
+
+    def __evaluate__(self, answer_text, target_text, initial_config, involved_classes, test_entry_id):
+        test_entry = {
+            "initial_config": eval(initial_config),
+            "involved_classes": eval(involved_classes),
+            "id": test_entry_id,
+        }
+        test_category = ""  # following bfcl_eval's original eval code.
+        model_name = ""  # following bfcl_eval's original eval code.
+        multi_turn_ground_truth_list = eval(target_text)
+
+        accuracy_checker_result = multi_turn_checker(
+            answer_text,
+            multi_turn_ground_truth_list,
+            test_entry,
+            test_category,
+            model_name,
+        )
+        return str(accuracy_checker_result["valid"])
+
+    def evaluate(self, data):
+        self.validate_data(data)
+        data[self.__class__.__name__ + "_result"] = data.apply(
+            lambda x: self.__evaluate__(
+                x[self.model_output_col],
+                x[self.ground_truth_col],
+                x[self.initial_config_col],
+                x[self.involved_classes_col],
+                x[self.test_entry_id_col],
+            ),
+            axis=1,
+        )
+        return data
diff --git a/setup.py b/setup.py
index 17cc9964..95b580b0 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,4 @@
-from setuptools import setup, find_packages
-
+from setuptools import find_packages, setup
 
 setup(
     name='eureka_ml_insights',
@@ -44,6 +43,7 @@
         'pycocotools>=2.0.8',
         'vllm>=0.8.5',
         'latex2sympy2>=1.9.1',
+        'bfcl-eval>=2025.10.13',
         'parameterized>=0.9.0',
         'pyseccomp>=0.1.2',
     ],
@@ -54,9 +54,9 @@
     },
     classifiers=[
        # Full list at https://pypi.org/classifiers/
-        'Programming Language :: Python :: 3',
-        'License :: Apache License 2.0',
-        'Operating System :: OS Independent',
+        "Programming Language :: Python :: 3",
+        "License :: Apache License 2.0",
+        "Operating System :: OS Independent",
     ],
-    python_requires='>=3.8',
+    python_requires=">=3.8",
 )