From 682b1cc814a2b081a63cb665e8a33595b61d8e9a Mon Sep 17 00:00:00 2001
From: lchen001
Date: Mon, 4 Aug 2025 23:49:07 +0000
Subject: [PATCH 1/4] add bfcl utility

---
 .../data_utils/bfcl_multiturn_utils.py | 53 +++++++++++++++++++
 eureka_ml_insights/metrics/bfcl_metrics.py | 51 ++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
 create mode 100644 eureka_ml_insights/metrics/bfcl_metrics.py

diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
new file mode 100644
index 00000000..1cb2e87a
--- /dev/null
+++ b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
@@ -0,0 +1,53 @@
+import re, json, ast
+from dataclasses import dataclass
+
+import pandas as pd
+
+from .transform import DFTransformBase
+
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import (
+    execute_multi_turn_func_call
+)
+
+@dataclass
+class BFCLMultiturnExecuteCall(DFTransformBase):
+    model_output_column: str
+    model_answer_column: str
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        df[self.model_answer_column] = df.apply(self.execuate_model_output,axis=1)
+        return df
+
+    @staticmethod
+    def execuate_model_output(response):
+        """
+        Execute the model output to get the function output.
+
+        Parameters:
+            response (str): Input string containing answer X in the form of "Final Answer: X".
+        Returns:
+            numerical_value (float or str): A numeric value or JSON string representing the model's answer.
+        """
+        test_entry = response
+        response_text = test_entry["model_output"]
+        initial_config: dict = eval(test_entry["initial_config"])
+        involved_classes: list = eval(test_entry["involved_classes"])
+        test_entry_id: str = test_entry["id"]
+        test_category: str = test_entry_id.rsplit("_", 1)[0]
+
+        func_calls = re.findall(r'\w+\([^)]*\)', response_text)
+        if(len(func_calls)==0):
+            return "No call executed"
+
+        execution_results, involved_instances = execute_multi_turn_func_call(
+            func_call_list = func_calls,
+            initial_config = initial_config,
+            involved_classes = involved_classes,
+            model_name = "",
+            test_entry_id=test_entry_id,
+            long_context = (
+                "long_context" in test_category or "composite" in test_category
+            ),
+            is_evaL_run=False,
+        )
+        return " ".join(execution_results)
diff --git a/eureka_ml_insights/metrics/bfcl_metrics.py b/eureka_ml_insights/metrics/bfcl_metrics.py
new file mode 100644
index 00000000..a210b636
--- /dev/null
+++ b/eureka_ml_insights/metrics/bfcl_metrics.py
@@ -0,0 +1,51 @@
+from eureka_ml_insights.metrics.metrics_base import ClassicMetric
+
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import (
+    multi_turn_checker)
+
+class BFCLMultiturnMatch(ClassicMetric):
+    """This metric class checks if two dictionary strings represent the same dictionary."""
+
+    def __init__(self, model_output_col: str = "model_output",
+                 ground_truth_col: str = "ground_truth",
+                 initial_config_col:str = "initial_config",
+                 involved_classes_col: str = "involved_classes",
+                 test_entry_id_col: str = "id",
+                 ):
+        super().__init__()
+        self.model_output_col = model_output_col
+        self.ground_truth_col = ground_truth_col
+        self.initial_config_col = initial_config_col
+        self.involved_classes_col = involved_classes_col
+        self.test_entry_id_col = test_entry_id_col
+
+    def __evaluate__(self, answer_text, target_text,initial_config,involved_classes,test_entry_id):
+        test_entry = {"initial_config":eval(initial_config),
+                      "involved_classes":eval(involved_classes),
"id":test_entry_id, + } + test_entry = test_entry + test_category = "" + model_name = "" + multi_turn_ground_truth_list = eval(target_text) + + accuracy_checker_result = multi_turn_checker( + answer_text, + multi_turn_ground_truth_list, + test_entry, + test_category, + model_name, + ) + return str(accuracy_checker_result['valid']) + + def evaluate(self, data): + self.validate_data(data) + data[self.__class__.__name__ + "_result"] = data.apply( + lambda x: self.__evaluate__(x[self.model_output_col], + x[self.ground_truth_col], + x[self.initial_config_col], + x[self.involved_classes_col], + x[self.test_entry_id_col], + ), axis=1 + ) + return data From 1f56767ccc1cc035bc1eaecf88f67b8b96581f63 Mon Sep 17 00:00:00 2001 From: lchen001 Date: Tue, 14 Oct 2025 00:56:58 +0000 Subject: [PATCH 2/4] update the doc strings and add some comments --- .../data_utils/bfcl_multiturn_utils.py | 42 +++++++-------- eureka_ml_insights/metrics/bfcl_metrics.py | 54 ++++++++++--------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py index 1cb2e87a..4e82a5b3 100644 --- a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py +++ b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py @@ -1,13 +1,13 @@ -import re, json, ast +import re from dataclasses import dataclass import pandas as pd +from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import ( + execute_multi_turn_func_call, +) from .transform import DFTransformBase -from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import ( - execute_multi_turn_func_call -) @dataclass class BFCLMultiturnExecuteCall(DFTransformBase): @@ -15,39 +15,37 @@ class BFCLMultiturnExecuteCall(DFTransformBase): model_answer_column: str def transform(self, df: pd.DataFrame) -> pd.DataFrame: - df[self.model_answer_column] = df.apply(self.execuate_model_output,axis=1) + df[self.model_answer_column] = df.apply(self.execuate_model_output, axis=1) return df @staticmethod - def execuate_model_output(response): + def execuate_model_output(df_row): """ Execute the model output to get the function output. - + Parameters: - response (str): Input string containing answer X in the form of "Final Answer: X". + df_row (a row in dataframe): Input is a dataframe row containing model_output, initial_config, and, involved_classes. Returns: - numerical_value (float or str): A numeric value or JSON string representing the model's answer. + " ".join(execution_results) (str): A string denoting the execution of the function calls. 
""" - test_entry = response + test_entry = df_row response_text = test_entry["model_output"] initial_config: dict = eval(test_entry["initial_config"]) involved_classes: list = eval(test_entry["involved_classes"]) test_entry_id: str = test_entry["id"] test_category: str = test_entry_id.rsplit("_", 1)[0] - func_calls = re.findall(r'\w+\([^)]*\)', response_text) - if(len(func_calls)==0): + func_calls = re.findall(r"\w+\([^)]*\)", response_text) + if len(func_calls) == 0: return "No call executed" - + execution_results, involved_instances = execute_multi_turn_func_call( - func_call_list = func_calls, - initial_config = initial_config, - involved_classes = involved_classes, - model_name = "", - test_entry_id=test_entry_id, - long_context = ( - "long_context" in test_category or "composite" in test_category - ), - is_evaL_run=False, + func_call_list=func_calls, + initial_config=initial_config, + involved_classes=involved_classes, + model_name="", + test_entry_id=test_entry_id, + long_context=("long_context" in test_category or "composite" in test_category), + is_evaL_run=False, ) return " ".join(execution_results) diff --git a/eureka_ml_insights/metrics/bfcl_metrics.py b/eureka_ml_insights/metrics/bfcl_metrics.py index a210b636..f76a359d 100644 --- a/eureka_ml_insights/metrics/bfcl_metrics.py +++ b/eureka_ml_insights/metrics/bfcl_metrics.py @@ -1,17 +1,21 @@ +from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import ( + multi_turn_checker, +) + from eureka_ml_insights.metrics.metrics_base import ClassicMetric -from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import ( - multi_turn_checker) class BFCLMultiturnMatch(ClassicMetric): - """This metric class checks if two dictionary strings represent the same dictionary.""" + """This metric class checks if the generated function calls match the ground-truth.""" - def __init__(self, model_output_col: str = "model_output", - ground_truth_col: str = "ground_truth", - initial_config_col:str = "initial_config", - involved_classes_col: str = "involved_classes", - test_entry_id_col: str = "id", - ): + def __init__( + self, + model_output_col: str = "model_output", + ground_truth_col: str = "ground_truth", + initial_config_col: str = "initial_config", + involved_classes_col: str = "involved_classes", + test_entry_id_col: str = "id", + ): super().__init__() self.model_output_col = model_output_col self.ground_truth_col = ground_truth_col @@ -19,16 +23,16 @@ def __init__(self, model_output_col: str = "model_output", self.involved_classes_col = involved_classes_col self.test_entry_id_col = test_entry_id_col - def __evaluate__(self, answer_text, target_text,initial_config,involved_classes,test_entry_id): - test_entry = {"initial_config":eval(initial_config), - "involved_classes":eval(involved_classes), - "id":test_entry_id, + def __evaluate__(self, answer_text, target_text, initial_config, involved_classes, test_entry_id): + test_entry = { + "initial_config": eval(initial_config), + "involved_classes": eval(involved_classes), + "id": test_entry_id, } - test_entry = test_entry - test_category = "" - model_name = "" + test_category = "" # following the bfcl_eval's original eval code. + model_name = "" # following the bfcl_eval's original eval code. 
         multi_turn_ground_truth_list = eval(target_text)
-
+
         accuracy_checker_result = multi_turn_checker(
             answer_text,
             multi_turn_ground_truth_list,
@@ -36,16 +40,18 @@ def __evaluate__(self, answer_text, target_text,initial_config,involved_classes,
             test_category,
             model_name,
         )
-        return str(accuracy_checker_result['valid'])
+        return str(accuracy_checker_result["valid"])
 
     def evaluate(self, data):
         self.validate_data(data)
         data[self.__class__.__name__ + "_result"] = data.apply(
-            lambda x: self.__evaluate__(x[self.model_output_col],
-                                        x[self.ground_truth_col],
-                                        x[self.initial_config_col],
-                                        x[self.involved_classes_col],
-                                        x[self.test_entry_id_col],
-                                        ), axis=1
+            lambda x: self.__evaluate__(
+                x[self.model_output_col],
+                x[self.ground_truth_col],
+                x[self.initial_config_col],
+                x[self.involved_classes_col],
+                x[self.test_entry_id_col],
+            ),
+            axis=1,
        )
         return data

From 0469c8f3600123e512e45633a88752ecfccf1de8 Mon Sep 17 00:00:00 2001
From: lchen001
Date: Tue, 14 Oct 2025 01:03:10 +0000
Subject: [PATCH 3/4] update setup for bfcl_eval

---
 setup.py | 86 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/setup.py b/setup.py
index 80fa73b6..1e6c5c98 100644
--- a/setup.py
+++ b/setup.py
@@ -1,53 +1,53 @@
-from setuptools import setup, find_packages
-
+from setuptools import find_packages, setup
 
 setup(
-    name='eureka_ml_insights',
-    version='0.1.0',
-    author='Microsoft Research',
-    author_email='eureka-ml-insights@microsoft.com',
-    description='Eureka ML Insights Framework',
-    long_description=open('README.md', encoding="utf-8").read(),
-    long_description_content_type='text/markdown',
-    url='https://github.com/microsoft/eureka-ml-insights',
+    name="eureka_ml_insights",
+    version="0.1.0",
+    author="Microsoft Research",
+    author_email="eureka-ml-insights@microsoft.com",
+    description="Eureka ML Insights Framework",
+    long_description=open("README.md", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/microsoft/eureka-ml-insights",
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        'anthropic>=0.49.0',
-        'azure-ai-textanalytics>=5.3.0',
-        'azure-core>=1.29.5',
-        'azure-keyvault-secrets>=4.8.0',
-        'azure-identity>=1.16.0',
-        'azure-storage-blob>=12.20.0',
-        'datasets>=3.2.0',
-        'fuzzywuzzy>=0.18.0',
-        'jsonlines>=2.0.0',
-        'pandas>=2.2.1',
-        'pillow>=10.0.1',
-        'torch>=2.6.0',
-        'numpy==1.26.4',
-        'tqdm>=4.65.0',
-        'jinja2>=3.1.3',
-        'transformers>=4.40.2',
-        'immutabledict>=4.2.0',
-        'langdetect>=1.0.9',
-        'nltk>=3.9.1',
-        'absl-py>=2.1.0',
-        'tiktoken>=0.8.0',
-        'python-levenshtein>=0.12.2',
-        'google-generativeai>=0.7.0',
-        'openai>=1.35.5',
-        'bitsandbytes>=0.42.0',
-        'accelerate>=0.21.0',
-        'pycocotools>=2.0.8',
-        'vllm>=0.8.0',
-        'latex2sympy2>=1.9.1',
+        "anthropic>=0.49.0",
+        "azure-ai-textanalytics>=5.3.0",
+        "azure-core>=1.29.5",
+        "azure-keyvault-secrets>=4.8.0",
+        "azure-identity>=1.16.0",
+        "azure-storage-blob>=12.20.0",
+        "datasets>=3.2.0",
+        "fuzzywuzzy>=0.18.0",
+        "jsonlines>=2.0.0",
+        "pandas>=2.2.1",
+        "pillow>=10.0.1",
+        "torch>=2.6.0",
+        "numpy==1.26.4",
+        "tqdm>=4.65.0",
+        "jinja2>=3.1.3",
+        "transformers>=4.40.2",
+        "immutabledict>=4.2.0",
+        "langdetect>=1.0.9",
+        "nltk>=3.9.1",
+        "absl-py>=2.1.0",
+        "tiktoken>=0.8.0",
+        "python-levenshtein>=0.12.2",
+        "google-generativeai>=0.7.0",
+        "openai>=1.35.5",
+        "bitsandbytes>=0.42.0",
+        "accelerate>=0.21.0",
+        "pycocotools>=2.0.8",
+        "vllm>=0.8.0",
+        "latex2sympy2>=1.9.1",
+        "bfcl-eval>=2025.10.13",
     ],
classifiers=[ # Full list at https://pypi.org/classifiers/ - 'Programming Language :: Python :: 3', - 'License :: Apache License 2.0', - 'Operating System :: OS Independent', + "Programming Language :: Python :: 3", + "License :: Apache License 2.0", + "Operating System :: OS Independent", ], - python_requires='>=3.8', + python_requires=">=3.8", ) From a65926e305a218ba1b3aa5e94d88bce0e3b0deb9 Mon Sep 17 00:00:00 2001 From: lchen001 Date: Wed, 29 Oct 2025 18:42:13 +0000 Subject: [PATCH 4/4] incorporate feedback --- environment.yml | 1 + .../data_utils/bfcl_multiturn_utils.py | 12 +++++++++--- setup.py | 18 +++++++++--------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/environment.yml b/environment.yml index dbb1a604..b8575a8d 100644 --- a/environment.yml +++ b/environment.yml @@ -67,4 +67,5 @@ dependencies: - pycocotools>=2.0.8 - vllm>=0.8.5 - latex2sympy2>=1.9.1 + - bfcl-eval>=2025.10.13 prefix: /home/sayouse/miniconda3/envs/myenv \ No newline at end of file diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py index 4e82a5b3..a8e9f8b1 100644 --- a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py +++ b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py @@ -1,4 +1,4 @@ -import re +import re, ast from dataclasses import dataclass import pandas as pd @@ -30,8 +30,14 @@ def execuate_model_output(df_row): """ test_entry = df_row response_text = test_entry["model_output"] - initial_config: dict = eval(test_entry["initial_config"]) - involved_classes: list = eval(test_entry["involved_classes"]) + try: + initial_config = ast.literal_eval(test_entry["initial_config"]) + except (ValueError, SyntaxError) as e: + raise ValueError(f"Invalid initial_config format: {test_entry['initial_config']}") from e + try: + involved_classes: list = eval(test_entry["involved_classes"]) + except (ValueError, SyntaxError) as e: + raise ValueError(f"Invalid involved_classes format: {test_entry['involved_classes']}") from e test_entry_id: str = test_entry["id"] test_category: str = test_entry_id.rsplit("_", 1)[0] diff --git a/setup.py b/setup.py index 650bb5e9..0f1c7a55 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,14 @@ from setuptools import find_packages, setup setup( - name="eureka_ml_insights", - version="0.1.0", - author="Microsoft Research", - author_email="eureka-ml-insights@microsoft.com", - description="Eureka ML Insights Framework", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - url="https://github.com/microsoft/eureka-ml-insights", + name='eureka_ml_insights', + version='0.1.0', + author='Microsoft Research', + author_email='eureka-ml-insights@microsoft.com', + description='Eureka ML Insights Framework', + long_description=open('README.md', encoding="utf-8").read(), + long_description_content_type='text/markdown', + url='https://github.com/microsoft/eureka-ml-insights', packages=find_packages(), include_package_data=True, install_requires=[ @@ -41,7 +41,7 @@ 'pycocotools>=2.0.8', 'vllm>=0.8.5', 'latex2sympy2>=1.9.1', - "bfcl-eval>=2025.10.13", + 'bfcl-eval>=2025.10.13', ], extras_require={ 'llamacpp': [