5 changes: 4 additions & 1 deletion .gitignore
@@ -5,8 +5,11 @@ __pycache__
data/sample_data_200
data/all_data_912
data/all_data_912_v0.1
data/test_data_1

inference/outputs/*
inference/log/*

outputs
outputs

.venv
43 changes: 30 additions & 13 deletions evaluation/evaluation.py
@@ -133,8 +133,8 @@ def cell_level_compare(wb_gt, wb_proc, sheet_name, cell_range):
cell_proc = ws_proc[cell_name]

if not compare_cell_value(cell_gt.value, cell_proc.value):
msg = f"Value difference at cell {cell_gt.coordinate}: ws_gt has {cell_gt.value},\
ws_proc has {cell_proc.value}"
msg = f"""Value difference at [{sheet_name}!{cell_gt.coordinate}]:
answer <-> processed: [{cell_gt.value}] <-> [{cell_proc.value}], type({type(cell_gt.value)}) <-> type({type(cell_proc.value)})"""
return False, msg

# if not compare_fill_color(cell_gt.fill, cell_proc.fill):
@@ -185,15 +185,15 @@ def compare_workbooks(gt_file, proc_file, instruction_type, answer_position):
result_list.append(result)
msg_list.append(msg)

return all(result_list), ""
return all(result_list), msg_list


def parse_option():
parser = argparse.ArgumentParser("command line arguments for evaluation.")

parser.add_argument('--model', type=str, default='llama', help='model name')
parser.add_argument('--setting', type=str, default='single',
help='four setting: single, multi_react_exec, multi_row_exec, multi_row_react_exec')
help='four setting: single, multi_react_exec, multi_row_exec, multi_row_react_exec, univer')
parser.add_argument('--dataset', type=str, default="all_data_912", help='dataset name')

opt = parser.parse_args()
@@ -202,21 +202,35 @@ def parse_option():


def evaluation(opt):
dataset_path = os.path.abspath(f'../data/{opt.dataset}')
with open(f'{dataset_path}/dataset.json', 'r') as fp:
# Get the directory where this script is located
script_dir = os.path.dirname(os.path.abspath(__file__))
# Navigate to the data directory relative to the script location
dataset_path = os.path.join(script_dir, '..', 'data', opt.dataset)
dataset_path = os.path.abspath(dataset_path)

dataset_json_path = os.path.join(dataset_path, 'dataset.json')
with open(dataset_json_path, 'r', encoding='utf-8') as fp:
dataset = json.load(fp)

eval_results = []
for data in tqdm(dataset):
test_case_results = []
for test_case_idx in range(3):
gt_path = f"{dataset_path}/spreadsheet/{data['id']}/{test_case_idx + 1}_{data['id']}_answer.xlsx"
proc_path = f"{dataset_path}/spreadsheet/{data['id']}/{test_case_idx + 1}_{data['id']}_input.xlsx"
# proc_path = f"{dataset_path}/outputs/{opt.setting}_{opt.model}/{test_case_idx + 1}_{data['id']}_output.xlsx"
msg_list = []
for test_case_idx in range(1): # evaluate only the first test case
answer_filename = f"{test_case_idx + 1}_{data['id']}_answer.xlsx"
gt_path = os.path.join(dataset_path, 'spreadsheet', str(data['id']), answer_filename)

# proc_path = os.path.join(dataset_path, 'spreadsheet', str(data['id']), f"{test_case_idx + 1}_{data['id']}_input.xlsx")
output_filename = f"{test_case_idx + 1}_{data['id']}_output.xlsx"
output_dir = f"{opt.setting}_{opt.model}"
proc_path = os.path.join(dataset_path, 'outputs', output_dir, output_filename)

try:
result, _ = compare_workbooks(gt_path, proc_path, data['instruction_type'], data['answer_position'])
except:
result, msg = compare_workbooks(gt_path, proc_path, data['instruction_type'], data['answer_position'])
msg_list.append(msg)
except Exception as e:
result = False
msg_list.append(f"Error: {str(e)}")
test_case_results.append(int(result))
soft_restriction = test_case_results.count(1) / len(test_case_results)
hard_restriction = 0 if 0 in test_case_results else 1
@@ -226,9 +240,12 @@
'test_case_results': test_case_results,
'soft_restriction': soft_restriction,
'hard_restriction': hard_restriction,
'msg': msg_list,
})

with open(f'../outputs/eval_{opt.setting}_{opt.model}.json', 'w') as fp:
eval_filename = f"eval_{opt.setting}_{opt.model}.json"
eval_path = os.path.join(dataset_path, 'outputs', eval_filename)
with open(eval_path, 'w', encoding='utf-8') as fp:
json.dump(eval_results, fp, indent=4)


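After this change, `compare_workbooks` returns `(passed, msg_list)` rather than `(passed, "")`, so callers can surface the per-cell diagnostics. Below is a minimal sketch of how a caller might consume the new return value; the import path, workbook paths, and the instruction_type/answer_position arguments are illustrative assumptions, not taken from this PR.

```python
# Sketch only (not part of this PR): consuming the new (passed, msg_list)
# contract of compare_workbooks. All paths and argument values are placeholders.
from evaluation import compare_workbooks  # assumes evaluation/evaluation.py is on sys.path

gt_path = "data/all_data_912/spreadsheet/0001/1_0001_answer.xlsx"        # placeholder path
proc_path = "data/all_data_912/outputs/univer_llama/1_0001_output.xlsx"  # placeholder path

passed, msg_list = compare_workbooks(
    gt_path,
    proc_path,
    "cell_level",      # hypothetical instruction_type
    "Sheet1!A1:B2",    # hypothetical answer_position
)

if not passed:
    for msg in msg_list:
        # Messages now include sheet name, cell coordinate, both values, and both types,
        # e.g. "Value difference at [Sheet1!A1]: answer <-> processed: [1] <-> [2], ..."
        print(msg)
```

For the end-to-end run, the updated `evaluation()` resolves the dataset relative to the script location and writes its report to `<dataset>/outputs/eval_<setting>_<model>.json`; an invocation along the lines of `python evaluation/evaluation.py --model llama --setting univer --dataset all_data_912` (exact entry point not shown in this diff) would therefore produce `data/all_data_912/outputs/eval_univer_llama.json`.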