opendatalab · e06084 · Jul 31, 2025 · Jul 31, 2025
diff --git a/.gitignore b/.gitignore
@@ -45,5 +45,4 @@ output/
 .coverage*
 coverage.xml
 
-webmainbench.egg-info/*
-results/*
+webmainbench.egg-info/*
diff --git a/results/llm_webkit_evaluation_report.csv b/results/llm_webkit_evaluation_report.csv
@@ -0,0 +1,2 @@
+extractor,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
+llm-webkit,3,1.0,0.8221,0.8293,0.7076,1.0,0.963,0.6106
diff --git a/results/llm_webkit_evaluation_results.json b/results/llm_webkit_evaluation_results.json
@@ -0,0 +1,327 @@
+{
+  "metadata": {
+    "dataset_name": "llm_webkit_test",
+    "extractor_name": "llm-webkit",
+    "timestamp": "2025-07-31T13:52:12.948959",
+    "total_samples": 3
+  },
+  "overall_metrics": {
+    "code_edit": 0.8293333333333334,
+    "formula_edit": 0.7076023391812866,
+    "table_edit": 0.9629629629629629,
+    "table_TEDS": 1.0,
+    "text_edit": 0.6105951152390782,
+    "overall": 0.8220987501433322
+  },
+  "sample_results": [
+    {
+      "sample_id": "text_code_sample",
+      "extraction_success": true,
+      "extraction_time": 3.6406631469726562,
+      "metrics": {
+        "code_edit": {
+          "score": 0.488,
+          "success": true,
+          "details": {
+            "distance": 64,
+            "predicted_length": 125,
+            "groundtruth_length": 61,
+            "normalized": true,
+            "predicted_code_length": 125,
+            "groundtruth_code_length": 61,
+            "content_type": "code"
+          }
+        },
+        "formula_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_formula_length": 0,
+            "groundtruth_formula_length": 0,
+            "content_type": "formula"
+          }
+        },
+        "table_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_table_length": 0,
+            "groundtruth_table_length": 0,
+            "content_type": "table"
+          }
+        },
+        "table_TEDS": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "edit_distance": 0.0,
+            "predicted_nodes": 3,
+            "groundtruth_nodes": 3,
+            "max_nodes": 3,
+            "structure_only": false,
+            "algorithm": "TEDS",
+            "content_type": "table"
+          }
+        },
+        "text_edit": {
+          "score": 0.9298245614035088,
+          "success": true,
+          "details": {
+            "distance": 4,
+            "predicted_length": 57,
+            "groundtruth_length": 53,
+            "normalized": true,
+            "predicted_text_length": 57,
+            "groundtruth_text_length": 53,
+            "content_type": "text"
+          }
+        },
+        "overall": {
+          "score": 0.8835649122807018,
+          "success": true,
+          "details": {
+            "source": "average_of_all_metrics",
+            "description": "Overall score as average of all successful metrics",
+            "successful_metrics": 5,
+            "failed_metrics": 0,
+            "individual_scores": {
+              "code_edit": 0.488,
+              "formula_edit": 1.0,
+              "table_edit": 1.0,
+              "table_TEDS": 1.0,
+              "text_edit": 0.9298245614035088
+            }
+          }
+        }
+      },
+      "sample_metadata": {
+        "url": null,
+        "domain": null,
+        "language": null,
+        "content_type": null,
+        "difficulty": null
+      }
+    },
+    {
+      "sample_id": "table_sample",
+      "extraction_success": true,
+      "extraction_time": 1.6590700149536133,
+      "metrics": {
+        "code_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_code_length": 0,
+            "groundtruth_code_length": 0,
+            "content_type": "code"
+          }
+        },
+        "formula_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_formula_length": 0,
+            "groundtruth_formula_length": 0,
+            "content_type": "formula"
+          }
+        },
+        "table_edit": {
+          "score": 0.8888888888888888,
+          "success": true,
+          "details": {
+            "distance": 9,
+            "predicted_length": 72,
+            "groundtruth_length": 81,
+            "normalized": true,
+            "predicted_table_length": 72,
+            "groundtruth_table_length": 81,
+            "content_type": "table"
+          }
+        },
+        "table_TEDS": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "edit_distance": 0.0,
+            "predicted_nodes": 13,
+            "groundtruth_nodes": 13,
+            "max_nodes": 13,
+            "structure_only": false,
+            "algorithm": "TEDS",
+            "content_type": "table"
+          }
+        },
+        "text_edit": {
+          "score": 0.6666666666666667,
+          "success": true,
+          "details": {
+            "distance": 3,
+            "predicted_length": 9,
+            "groundtruth_length": 6,
+            "normalized": true,
+            "predicted_text_length": 9,
+            "groundtruth_text_length": 6,
+            "content_type": "text"
+          }
+        },
+        "overall": {
+          "score": 0.9111111111111111,
+          "success": true,
+          "details": {
+            "source": "average_of_all_metrics",
+            "description": "Overall score as average of all successful metrics",
+            "successful_metrics": 5,
+            "failed_metrics": 0,
+            "individual_scores": {
+              "code_edit": 1.0,
+              "formula_edit": 1.0,
+              "table_edit": 0.8888888888888888,
+              "table_TEDS": 1.0,
+              "text_edit": 0.6666666666666667
+            }
+          }
+        }
+      },
+      "sample_metadata": {
+        "url": null,
+        "domain": null,
+        "language": null,
+        "content_type": null,
+        "difficulty": null
+      }
+    },
+    {
+      "sample_id": "formula_sample",
+      "extraction_success": true,
+      "extraction_time": 1.5354089736938477,
+      "metrics": {
+        "code_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_code_length": 0,
+            "groundtruth_code_length": 0,
+            "content_type": "code"
+          }
+        },
+        "formula_edit": {
+          "score": 0.1228070175438597,
+          "success": true,
+          "details": {
+            "distance": 50,
+            "predicted_length": 9,
+            "groundtruth_length": 57,
+            "normalized": true,
+            "predicted_formula_length": 9,
+            "groundtruth_formula_length": 57,
+            "content_type": "formula"
+          }
+        },
+        "table_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_table_length": 0,
+            "groundtruth_table_length": 0,
+            "content_type": "table"
+          }
+        },
+        "table_TEDS": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "edit_distance": 0.0,
+            "predicted_nodes": 3,
+            "groundtruth_nodes": 3,
+            "max_nodes": 3,
+            "structure_only": false,
+            "algorithm": "TEDS",
+            "content_type": "table"
+          }
+        },
+        "text_edit": {
+          "score": 0.23529411764705888,
+          "success": true,
+          "details": {
+            "distance": 65,
+            "predicted_length": 85,
+            "groundtruth_length": 37,
+            "normalized": true,
+            "predicted_text_length": 85,
+            "groundtruth_text_length": 37,
+            "content_type": "text"
+          }
+        },
+        "overall": {
+          "score": 0.6716202270381837,
+          "success": true,
+          "details": {
+            "source": "average_of_all_metrics",
+            "description": "Overall score as average of all successful metrics",
+            "successful_metrics": 5,
+            "failed_metrics": 0,
+            "individual_scores": {
+              "code_edit": 1.0,
+              "formula_edit": 0.1228070175438597,
+              "table_edit": 1.0,
+              "table_TEDS": 1.0,
+              "text_edit": 0.23529411764705888
+            }
+          }
+        }
+      },
+      "sample_metadata": {
+        "url": null,
+        "domain": null,
+        "language": null,
+        "content_type": null,
+        "difficulty": null
+      }
+    }
+  ],
+  "category_metrics": {
+    "unknown": {
+      "code_edit": 0.8293333333333334,
+      "formula_edit": 0.7076023391812866,
+      "table_edit": 0.9629629629629629,
+      "table_TEDS": 1.0,
+      "text_edit": 0.6105951152390782,
+      "overall": 0.8220987501433322
+    }
+  },
+  "error_analysis": {
+    "total_samples": 3,
+    "failed_count": 0,
+    "success_rate": 1.0,
+    "common_errors": {},
+    "sample_errors": []
+  },
+  "extractor_config": {
+    "model_path": "/Users/chupei/model/checkpoint-3296"
+  },
+  "metric_config": {}
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		extractor,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
		llm-webkit,3,1.0,0.8221,0.8293,0.7076,1.0,0.963,0.6106