From 682b1cc814a2b081a63cb665e8a33595b61d8e9a Mon Sep 17 00:00:00 2001
From: lchen001
Date: Mon, 4 Aug 2025 23:49:07 +0000
Subject: [PATCH 1/4] add bfcl utility

---
 .../data_utils/bfcl_multiturn_utils.py | 53 +++++++++++++++++++
 eureka_ml_insights/metrics/bfcl_metrics.py | 51 ++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
 create mode 100644 eureka_ml_insights/metrics/bfcl_metrics.py

diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
new file mode 100644
index 00000000..1cb2e87a
--- /dev/null
+++ b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
@@ -0,0 +1,53 @@
+import re, json, ast
+from dataclasses import dataclass
+
+import pandas as pd
+
+from .transform import DFTransformBase
+
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import (
+    execute_multi_turn_func_call
+)
+
+@dataclass
+class BFCLMultiturnExecuteCall(DFTransformBase):
+    model_output_column: str
+    model_answer_column: str
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        df[self.model_answer_column] = df.apply(self.execuate_model_output,axis=1)
+        return df
+
+    @staticmethod
+    def execuate_model_output(response):
+        """
+        Execute the model output to get the function output.
+
+        Parameters:
+            response (str): Input string containing answer X in the form of "Final Answer: X".
+        Returns:
+            numerical_value (float or str): A numeric value or JSON string representing the model's answer.
+        """
+        test_entry = response
+        response_text = test_entry["model_output"]
+        initial_config: dict = eval(test_entry["initial_config"])
+        involved_classes: list = eval(test_entry["involved_classes"])
+        test_entry_id: str = test_entry["id"]
+        test_category: str = test_entry_id.rsplit("_", 1)[0]
+
+        func_calls = re.findall(r'\w+\([^)]*\)', response_text)
+        if(len(func_calls)==0):
+            return "No call executed"
+
+        execution_results, involved_instances = execute_multi_turn_func_call(
+            func_call_list = func_calls,
+            initial_config = initial_config,
+            involved_classes = involved_classes,
+            model_name = "",
+            test_entry_id=test_entry_id,
+            long_context = (
+                "long_context" in test_category or "composite" in test_category
+            ),
+            is_evaL_run=False,
+        )
+        return " ".join(execution_results)
diff --git a/eureka_ml_insights/metrics/bfcl_metrics.py b/eureka_ml_insights/metrics/bfcl_metrics.py
new file mode 100644
index 00000000..a210b636
--- /dev/null
+++ b/eureka_ml_insights/metrics/bfcl_metrics.py
@@ -0,0 +1,51 @@
+from eureka_ml_insights.metrics.metrics_base import ClassicMetric
+
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import (
+    multi_turn_checker)
+
+class BFCLMultiturnMatch(ClassicMetric):
+    """This metric class checks if two dictionary strings represent the same dictionary."""
+
+    def __init__(self, model_output_col: str = "model_output",
+                 ground_truth_col: str = "ground_truth",
+                 initial_config_col:str = "initial_config",
+                 involved_classes_col: str = "involved_classes",
+                 test_entry_id_col: str = "id",
+                 ):
+        super().__init__()
+        self.model_output_col = model_output_col
+        self.ground_truth_col = ground_truth_col
+        self.initial_config_col = initial_config_col
+        self.involved_classes_col = involved_classes_col
+        self.test_entry_id_col = test_entry_id_col
+
+    def __evaluate__(self, answer_text, target_text,initial_config,involved_classes,test_entry_id):
+        test_entry = {"initial_config":eval(initial_config),
+                      "involved_classes":eval(involved_classes),
"id":test_entry_id, + } + test_entry = test_entry + test_category = "" + model_name = "" + multi_turn_ground_truth_list = eval(target_text) + + accuracy_checker_result = multi_turn_checker( + answer_text, + multi_turn_ground_truth_list, + test_entry, + test_category, + model_name, + ) + return str(accuracy_checker_result['valid']) + + def evaluate(self, data): + self.validate_data(data) + data[self.__class__.__name__ + "_result"] = data.apply( + lambda x: self.__evaluate__(x[self.model_output_col], + x[self.ground_truth_col], + x[self.initial_config_col], + x[self.involved_classes_col], + x[self.test_entry_id_col], + ), axis=1 + ) + return data From 1f56767ccc1cc035bc1eaecf88f67b8b96581f63 Mon Sep 17 00:00:00 2001 From: lchen001 Date: Tue, 14 Oct 2025 00:56:58 +0000 Subject: [PATCH 2/4] update the doc strings and add some comments --- .../data_utils/bfcl_multiturn_utils.py | 42 +++++++-------- eureka_ml_insights/metrics/bfcl_metrics.py | 54 ++++++++++--------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py index 1cb2e87a..4e82a5b3 100644 --- a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py +++ b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py @@ -1,13 +1,13 @@ -import re, json, ast +import re from dataclasses import dataclass import pandas as pd +from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import ( + execute_multi_turn_func_call, +) from .transform import DFTransformBase -from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import ( - execute_multi_turn_func_call -) @dataclass class BFCLMultiturnExecuteCall(DFTransformBase): @@ -15,39 +15,37 @@ class BFCLMultiturnExecuteCall(DFTransformBase): model_answer_column: str def transform(self, df: pd.DataFrame) -> pd.DataFrame: - df[self.model_answer_column] = df.apply(self.execuate_model_output,axis=1) + df[self.model_answer_column] = df.apply(self.execuate_model_output, axis=1) return df @staticmethod - def execuate_model_output(response): + def execuate_model_output(df_row): """ Execute the model output to get the function output. - + Parameters: - response (str): Input string containing answer X in the form of "Final Answer: X". + df_row (a row in dataframe): Input is a dataframe row containing model_output, initial_config, and, involved_classes. Returns: - numerical_value (float or str): A numeric value or JSON string representing the model's answer. + " ".join(execution_results) (str): A string denoting the execution of the function calls. 
""" - test_entry = response + test_entry = df_row response_text = test_entry["model_output"] initial_config: dict = eval(test_entry["initial_config"]) involved_classes: list = eval(test_entry["involved_classes"]) test_entry_id: str = test_entry["id"] test_category: str = test_entry_id.rsplit("_", 1)[0] - func_calls = re.findall(r'\w+\([^)]*\)', response_text) - if(len(func_calls)==0): + func_calls = re.findall(r"\w+\([^)]*\)", response_text) + if len(func_calls) == 0: return "No call executed" - + execution_results, involved_instances = execute_multi_turn_func_call( - func_call_list = func_calls, - initial_config = initial_config, - involved_classes = involved_classes, - model_name = "", - test_entry_id=test_entry_id, - long_context = ( - "long_context" in test_category or "composite" in test_category - ), - is_evaL_run=False, + func_call_list=func_calls, + initial_config=initial_config, + involved_classes=involved_classes, + model_name="", + test_entry_id=test_entry_id, + long_context=("long_context" in test_category or "composite" in test_category), + is_evaL_run=False, ) return " ".join(execution_results) diff --git a/eureka_ml_insights/metrics/bfcl_metrics.py b/eureka_ml_insights/metrics/bfcl_metrics.py index a210b636..f76a359d 100644 --- a/eureka_ml_insights/metrics/bfcl_metrics.py +++ b/eureka_ml_insights/metrics/bfcl_metrics.py @@ -1,17 +1,21 @@ +from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import ( + multi_turn_checker, +) + from eureka_ml_insights.metrics.metrics_base import ClassicMetric -from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import ( - multi_turn_checker) class BFCLMultiturnMatch(ClassicMetric): - """This metric class checks if two dictionary strings represent the same dictionary.""" + """This metric class checks if the generated function calls match the ground-truth.""" - def __init__(self, model_output_col: str = "model_output", - ground_truth_col: str = "ground_truth", - initial_config_col:str = "initial_config", - involved_classes_col: str = "involved_classes", - test_entry_id_col: str = "id", - ): + def __init__( + self, + model_output_col: str = "model_output", + ground_truth_col: str = "ground_truth", + initial_config_col: str = "initial_config", + involved_classes_col: str = "involved_classes", + test_entry_id_col: str = "id", + ): super().__init__() self.model_output_col = model_output_col self.ground_truth_col = ground_truth_col @@ -19,16 +23,16 @@ def __init__(self, model_output_col: str = "model_output", self.involved_classes_col = involved_classes_col self.test_entry_id_col = test_entry_id_col - def __evaluate__(self, answer_text, target_text,initial_config,involved_classes,test_entry_id): - test_entry = {"initial_config":eval(initial_config), - "involved_classes":eval(involved_classes), - "id":test_entry_id, + def __evaluate__(self, answer_text, target_text, initial_config, involved_classes, test_entry_id): + test_entry = { + "initial_config": eval(initial_config), + "involved_classes": eval(involved_classes), + "id": test_entry_id, } - test_entry = test_entry - test_category = "" - model_name = "" + test_category = "" # following the bfcl_eval's original eval code. + model_name = "" # following the bfcl_eval's original eval code. 
         multi_turn_ground_truth_list = eval(target_text)
-
+
         accuracy_checker_result = multi_turn_checker(
             answer_text,
             multi_turn_ground_truth_list,
@@ -36,16 +40,18 @@ def __evaluate__(self, answer_text, target_text,initial_config,involved_classes,
             test_category,
             model_name,
         )
-        return str(accuracy_checker_result['valid'])
+        return str(accuracy_checker_result["valid"])
 
     def evaluate(self, data):
         self.validate_data(data)
         data[self.__class__.__name__ + "_result"] = data.apply(
-            lambda x: self.__evaluate__(x[self.model_output_col],
-                                        x[self.ground_truth_col],
-                                        x[self.initial_config_col],
-                                        x[self.involved_classes_col],
-                                        x[self.test_entry_id_col],
-                                        ), axis=1
+            lambda x: self.__evaluate__(
+                x[self.model_output_col],
+                x[self.ground_truth_col],
+                x[self.initial_config_col],
+                x[self.involved_classes_col],
+                x[self.test_entry_id_col],
+            ),
+            axis=1,
        )
         return data

From 0469c8f3600123e512e45633a88752ecfccf1de8 Mon Sep 17 00:00:00 2001
From: lchen001
Date: Tue, 14 Oct 2025 01:03:10 +0000
Subject: [PATCH 3/4] update setup for bfcl_eval

---
 setup.py | 86 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/setup.py b/setup.py
index 80fa73b6..1e6c5c98 100644
--- a/setup.py
+++ b/setup.py
@@ -1,53 +1,53 @@
-from setuptools import setup, find_packages
-
+from setuptools import find_packages, setup
 
 setup(
-    name='eureka_ml_insights',
-    version='0.1.0',
-    author='Microsoft Research',
-    author_email='eureka-ml-insights@microsoft.com',
-    description='Eureka ML Insights Framework',
-    long_description=open('README.md', encoding="utf-8").read(),
-    long_description_content_type='text/markdown',
-    url='https://github.com/microsoft/eureka-ml-insights',
+    name="eureka_ml_insights",
+    version="0.1.0",
+    author="Microsoft Research",
+    author_email="eureka-ml-insights@microsoft.com",
+    description="Eureka ML Insights Framework",
+    long_description=open("README.md", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/microsoft/eureka-ml-insights",
     packages=find_packages(),
     include_package_data=True,
     install_requires=[
-        'anthropic>=0.49.0',
-        'azure-ai-textanalytics>=5.3.0',
-        'azure-core>=1.29.5',
-        'azure-keyvault-secrets>=4.8.0',
-        'azure-identity>=1.16.0',
-        'azure-storage-blob>=12.20.0',
-        'datasets>=3.2.0',
-        'fuzzywuzzy>=0.18.0',
-        'jsonlines>=2.0.0',
-        'pandas>=2.2.1',
-        'pillow>=10.0.1',
-        'torch>=2.6.0',
-        'numpy==1.26.4',
-        'tqdm>=4.65.0',
-        'jinja2>=3.1.3',
-        'transformers>=4.40.2',
-        'immutabledict>=4.2.0',
-        'langdetect>=1.0.9',
-        'nltk>=3.9.1',
-        'absl-py>=2.1.0',
-        'tiktoken>=0.8.0',
-        'python-levenshtein>=0.12.2',
-        'google-generativeai>=0.7.0',
-        'openai>=1.35.5',
-        'bitsandbytes>=0.42.0',
-        'accelerate>=0.21.0',
-        'pycocotools>=2.0.8',
-        'vllm>=0.8.0',
-        'latex2sympy2>=1.9.1',
+        "anthropic>=0.49.0",
+        "azure-ai-textanalytics>=5.3.0",
+        "azure-core>=1.29.5",
+        "azure-keyvault-secrets>=4.8.0",
+        "azure-identity>=1.16.0",
+        "azure-storage-blob>=12.20.0",
+        "datasets>=3.2.0",
+        "fuzzywuzzy>=0.18.0",
+        "jsonlines>=2.0.0",
+        "pandas>=2.2.1",
+        "pillow>=10.0.1",
+        "torch>=2.6.0",
+        "numpy==1.26.4",
+        "tqdm>=4.65.0",
+        "jinja2>=3.1.3",
+        "transformers>=4.40.2",
+        "immutabledict>=4.2.0",
+        "langdetect>=1.0.9",
+        "nltk>=3.9.1",
+        "absl-py>=2.1.0",
+        "tiktoken>=0.8.0",
+        "python-levenshtein>=0.12.2",
+        "google-generativeai>=0.7.0",
+        "openai>=1.35.5",
+        "bitsandbytes>=0.42.0",
+        "accelerate>=0.21.0",
+        "pycocotools>=2.0.8",
+        "vllm>=0.8.0",
+        "latex2sympy2>=1.9.1",
+        "bfcl-eval>=2025.10.13",
     ],
classifiers=[ # Full list at https://pypi.org/classifiers/ - 'Programming Language :: Python :: 3', - 'License :: Apache License 2.0', - 'Operating System :: OS Independent', + "Programming Language :: Python :: 3", + "License :: Apache License 2.0", + "Operating System :: OS Independent", ], - python_requires='>=3.8', + python_requires=">=3.8", ) From a65926e305a218ba1b3aa5e94d88bce0e3b0deb9 Mon Sep 17 00:00:00 2001 From: lchen001 Date: Wed, 29 Oct 2025 18:42:13 +0000 Subject: [PATCH 4/4] incorporate feedback --- environment.yml | 1 + .../data_utils/bfcl_multiturn_utils.py | 12 +++++++++--- setup.py | 18 +++++++++--------- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/environment.yml b/environment.yml index dbb1a604..b8575a8d 100644 --- a/environment.yml +++ b/environment.yml @@ -67,4 +67,5 @@ dependencies: - pycocotools>=2.0.8 - vllm>=0.8.5 - latex2sympy2>=1.9.1 + - bfcl-eval>=2025.10.13 prefix: /home/sayouse/miniconda3/envs/myenv \ No newline at end of file diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py index 4e82a5b3..a8e9f8b1 100644 --- a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py +++ b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py @@ -1,4 +1,4 @@ -import re +import re, ast from dataclasses import dataclass import pandas as pd @@ -30,8 +30,14 @@ def execuate_model_output(df_row): """ test_entry = df_row response_text = test_entry["model_output"] - initial_config: dict = eval(test_entry["initial_config"]) - involved_classes: list = eval(test_entry["involved_classes"]) + try: + initial_config = ast.literal_eval(test_entry["initial_config"]) + except (ValueError, SyntaxError) as e: + raise ValueError(f"Invalid initial_config format: {test_entry['initial_config']}") from e + try: + involved_classes: list = eval(test_entry["involved_classes"]) + except (ValueError, SyntaxError) as e: + raise ValueError(f"Invalid involved_classes format: {test_entry['involved_classes']}") from e test_entry_id: str = test_entry["id"] test_category: str = test_entry_id.rsplit("_", 1)[0] diff --git a/setup.py b/setup.py index 650bb5e9..0f1c7a55 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,14 @@ from setuptools import find_packages, setup setup( - name="eureka_ml_insights", - version="0.1.0", - author="Microsoft Research", - author_email="eureka-ml-insights@microsoft.com", - description="Eureka ML Insights Framework", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - url="https://github.com/microsoft/eureka-ml-insights", + name='eureka_ml_insights', + version='0.1.0', + author='Microsoft Research', + author_email='eureka-ml-insights@microsoft.com', + description='Eureka ML Insights Framework', + long_description=open('README.md', encoding="utf-8").read(), + long_description_content_type='text/markdown', + url='https://github.com/microsoft/eureka-ml-insights', packages=find_packages(), include_package_data=True, install_requires=[ @@ -41,7 +41,7 @@ 'pycocotools>=2.0.8', 'vllm>=0.8.5', 'latex2sympy2>=1.9.1', - "bfcl-eval>=2025.10.13", + 'bfcl-eval>=2025.10.13', ], extras_require={ 'llamacpp': [