microsoft · lchen001 · Aug 4, 2025 · Oct 14, 2025 · Oct 14, 2025 · Oct 14, 2025
diff --git a/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py b/eureka_ml_insights/data_utils/bfcl_multiturn_utils.py
@@ -0,0 +1,53 @@
+import re, json, ast
+from dataclasses import dataclass
+
+import pandas as pd
+
+from .transform import DFTransformBase
+
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils import (
+    execute_multi_turn_func_call
+)
+
+@dataclass
+class BFCLMultiturnExecuteCall(DFTransformBase):
+    """Execute the function calls found in each row's model output."""
+
+    model_output_column: str  # column holding the raw model response text
+    model_answer_column: str  # column that receives the joined execution results
+
+    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Fill ``model_answer_column`` on ``df`` in place and return ``df``."""
+        df[self.model_answer_column] = df.apply(
+            lambda row: self.execuate_model_output(row, self.model_output_column), axis=1
+        )
+        return df
+
+    @staticmethod
+    def execuate_model_output(response, model_output_column: str = "model_output") -> str:
+        """Run the calls in one row's model output and return the joined results.
+
+        Parameters:
+            response: row exposing ``model_output_column`` plus the
+                ``initial_config``, ``involved_classes`` and ``id`` fields.
+            model_output_column: name of the field holding the response text.
+        Returns:
+            The execution results joined by single spaces, or
+            "No call executed" when no ``name(args)`` pattern is found.
+        """
+        # literal_eval, not eval: parsing row strings must never run arbitrary code.
+        initial_config: dict = ast.literal_eval(response["initial_config"])
+        involved_classes: list = ast.literal_eval(response["involved_classes"])
+        test_entry_id: str = response["id"]
+        test_category: str = test_entry_id.rsplit("_", 1)[0]
+        # NOTE(review): this pattern stops at the first ")" so nested calls are cut.
+        func_calls = re.findall(r'\w+\([^)]*\)', response[model_output_column])
+        if not func_calls:
+            return "No call executed"
+        execution_results, _ = execute_multi_turn_func_call(
+            func_call_list=func_calls, initial_config=initial_config,
+            involved_classes=involved_classes, model_name="", test_entry_id=test_entry_id,
+            long_context="long_context" in test_category or "composite" in test_category,
+            is_evaL_run=False,  # keyword spelling kept exactly as in the original call
+        )
+        return " ".join(execution_results)
diff --git a/eureka_ml_insights/metrics/bfcl_metrics.py b/eureka_ml_insights/metrics/bfcl_metrics.py
@@ -0,0 +1,51 @@
+from eureka_ml_insights.metrics.metrics_base import ClassicMetric
+
+from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import (
+    multi_turn_checker)
+
+class BFCLMultiturnMatch(ClassicMetric):
+    """Score each row with ``multi_turn_checker``; ``*_col`` args name the input columns."""
+
+    def __init__(self, model_output_col: str = "model_output",
+                 ground_truth_col: str = "ground_truth",
+                 initial_config_col: str = "initial_config",
+                 involved_classes_col: str = "involved_classes",
+                 test_entry_id_col: str = "id",
+                 ):
+        super().__init__()
+        self.model_output_col = model_output_col
+        self.ground_truth_col = ground_truth_col
+        self.initial_config_col = initial_config_col
+        self.involved_classes_col = involved_classes_col
+        self.test_entry_id_col = test_entry_id_col
+
+    def __evaluate__(self, answer_text, target_text, initial_config, involved_classes, test_entry_id):
+        """Return ``str`` of the checker's ``valid`` flag for one row.
+
+        ``target_text``, ``initial_config`` and ``involved_classes`` are Python
+        literal strings; ``answer_text`` is handed to the checker unchanged.
+        """
+        import ast  # local import: this module has no top-level ``ast`` import
+
+        # literal_eval, not eval: parsing row strings must never run arbitrary code.
+        test_entry = {
+            "initial_config": ast.literal_eval(initial_config),
+            "involved_classes": ast.literal_eval(involved_classes),
+            "id": test_entry_id,
+        }
+        # Category is the id text before the last "_" (previously always ""); confirm checker use.
+        test_category = str(test_entry_id).rsplit("_", 1)[0]
+        model_name = ""
+        accuracy_checker_result = multi_turn_checker(
+            answer_text, ast.literal_eval(target_text), test_entry, test_category, model_name,
+        )
+        return str(accuracy_checker_result["valid"])
+
+    def evaluate(self, data):
+        """Validate ``data``, add a ``<ClassName>_result`` column in place, return it."""
+        self.validate_data(data)
+        columns = (self.model_output_col, self.ground_truth_col, self.initial_config_col,
+                   self.involved_classes_col, self.test_entry_id_col)
+        result_col = self.__class__.__name__ + "_result"
+        data[result_col] = data.apply(lambda row: self.__evaluate__(*(row[c] for c in columns)), axis=1)
+        return data