diff --git a/.gitignore b/.gitignore
index de4c771..79998f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,5 +45,4 @@
 output/
 .coverage*
 coverage.xml
-webmainbench.egg-info/*
-results/*
\ No newline at end of file
+webmainbench.egg-info/*
\ No newline at end of file
diff --git a/results/llm_webkit_evaluation_report.csv b/results/llm_webkit_evaluation_report.csv
new file mode 100644
index 0000000..bd23b5f
--- /dev/null
+++ b/results/llm_webkit_evaluation_report.csv
@@ -0,0 +1,2 @@
+extractor,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
+llm-webkit,3,1.0,0.8221,0.8293,0.7076,1.0,0.963,0.6106
diff --git a/results/llm_webkit_evaluation_results.json b/results/llm_webkit_evaluation_results.json
new file mode 100644
index 0000000..9839d38
--- /dev/null
+++ b/results/llm_webkit_evaluation_results.json
@@ -0,0 +1,327 @@
+{
+  "metadata": {
+    "dataset_name": "llm_webkit_test",
+    "extractor_name": "llm-webkit",
+    "timestamp": "2025-07-31T13:52:12.948959",
+    "total_samples": 3
+  },
+  "overall_metrics": {
+    "code_edit": 0.8293333333333334,
+    "formula_edit": 0.7076023391812866,
+    "table_edit": 0.9629629629629629,
+    "table_TEDS": 1.0,
+    "text_edit": 0.6105951152390782,
+    "overall": 0.8220987501433322
+  },
+  "sample_results": [
+    {
+      "sample_id": "text_code_sample",
+      "extraction_success": true,
+      "extraction_time": 3.6406631469726562,
+      "metrics": {
+        "code_edit": {
+          "score": 0.488,
+          "success": true,
+          "details": {
+            "distance": 64,
+            "predicted_length": 125,
+            "groundtruth_length": 61,
+            "normalized": true,
+            "predicted_code_length": 125,
+            "groundtruth_code_length": 61,
+            "content_type": "code"
+          }
+        },
+        "formula_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_formula_length": 0,
+            "groundtruth_formula_length": 0,
+            "content_type": "formula"
+          }
+        },
+        "table_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_table_length": 0,
+            "groundtruth_table_length": 0,
+            "content_type": "table"
+          }
+        },
+        "table_TEDS": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "edit_distance": 0.0,
+            "predicted_nodes": 3,
+            "groundtruth_nodes": 3,
+            "max_nodes": 3,
+            "structure_only": false,
+            "algorithm": "TEDS",
+            "content_type": "table"
+          }
+        },
+        "text_edit": {
+          "score": 0.9298245614035088,
+          "success": true,
+          "details": {
+            "distance": 4,
+            "predicted_length": 57,
+            "groundtruth_length": 53,
+            "normalized": true,
+            "predicted_text_length": 57,
+            "groundtruth_text_length": 53,
+            "content_type": "text"
+          }
+        },
+        "overall": {
+          "score": 0.8835649122807018,
+          "success": true,
+          "details": {
+            "source": "average_of_all_metrics",
+            "description": "Overall score as average of all successful metrics",
+            "successful_metrics": 5,
+            "failed_metrics": 0,
+            "individual_scores": {
+              "code_edit": 0.488,
+              "formula_edit": 1.0,
+              "table_edit": 1.0,
+              "table_TEDS": 1.0,
+              "text_edit": 0.9298245614035088
+            }
+          }
+        }
+      },
+      "sample_metadata": {
+        "url": null,
+        "domain": null,
+        "language": null,
+        "content_type": null,
+        "difficulty": null
+      }
+    },
+    {
+      "sample_id": "table_sample",
+      "extraction_success": true,
+      "extraction_time": 1.6590700149536133,
+      "metrics": {
+        "code_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_code_length": 0,
+            "groundtruth_code_length": 0,
+            "content_type": "code"
+          }
+        },
+        "formula_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_formula_length": 0,
+            "groundtruth_formula_length": 0,
+            "content_type": "formula"
+          }
+        },
+        "table_edit": {
+          "score": 0.8888888888888888,
+          "success": true,
+          "details": {
+            "distance": 9,
+            "predicted_length": 72,
+            "groundtruth_length": 81,
+            "normalized": true,
+            "predicted_table_length": 72,
+            "groundtruth_table_length": 81,
+            "content_type": "table"
+          }
+        },
+        "table_TEDS": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "edit_distance": 0.0,
+            "predicted_nodes": 13,
+            "groundtruth_nodes": 13,
+            "max_nodes": 13,
+            "structure_only": false,
+            "algorithm": "TEDS",
+            "content_type": "table"
+          }
+        },
+        "text_edit": {
+          "score": 0.6666666666666667,
+          "success": true,
+          "details": {
+            "distance": 3,
+            "predicted_length": 9,
+            "groundtruth_length": 6,
+            "normalized": true,
+            "predicted_text_length": 9,
+            "groundtruth_text_length": 6,
+            "content_type": "text"
+          }
+        },
+        "overall": {
+          "score": 0.9111111111111111,
+          "success": true,
+          "details": {
+            "source": "average_of_all_metrics",
+            "description": "Overall score as average of all successful metrics",
+            "successful_metrics": 5,
+            "failed_metrics": 0,
+            "individual_scores": {
+              "code_edit": 1.0,
+              "formula_edit": 1.0,
+              "table_edit": 0.8888888888888888,
+              "table_TEDS": 1.0,
+              "text_edit": 0.6666666666666667
+            }
+          }
+        }
+      },
+      "sample_metadata": {
+        "url": null,
+        "domain": null,
+        "language": null,
+        "content_type": null,
+        "difficulty": null
+      }
+    },
+    {
+      "sample_id": "formula_sample",
+      "extraction_success": true,
+      "extraction_time": 1.5354089736938477,
+      "metrics": {
+        "code_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_code_length": 0,
+            "groundtruth_code_length": 0,
+            "content_type": "code"
+          }
+        },
+        "formula_edit": {
+          "score": 0.1228070175438597,
+          "success": true,
+          "details": {
+            "distance": 50,
+            "predicted_length": 9,
+            "groundtruth_length": 57,
+            "normalized": true,
+            "predicted_formula_length": 9,
+            "groundtruth_formula_length": 57,
+            "content_type": "formula"
+          }
+        },
+        "table_edit": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "distance": 0,
+            "predicted_length": 0,
+            "groundtruth_length": 0,
+            "normalized": true,
+            "predicted_table_length": 0,
+            "groundtruth_table_length": 0,
+            "content_type": "table"
+          }
+        },
+        "table_TEDS": {
+          "score": 1.0,
+          "success": true,
+          "details": {
+            "edit_distance": 0.0,
+            "predicted_nodes": 3,
+            "groundtruth_nodes": 3,
+            "max_nodes": 3,
+            "structure_only": false,
+            "algorithm": "TEDS",
+            "content_type": "table"
+          }
+        },
+        "text_edit": {
+          "score": 0.23529411764705888,
+          "success": true,
+          "details": {
+            "distance": 65,
+            "predicted_length": 85,
+            "groundtruth_length": 37,
+            "normalized": true,
+            "predicted_text_length": 85,
+            "groundtruth_text_length": 37,
+            "content_type": "text"
+          }
+        },
+        "overall": {
+          "score": 0.6716202270381837,
+          "success": true,
+          "details": {
+            "source": "average_of_all_metrics",
+            "description": "Overall score as average of all successful metrics",
+            "successful_metrics": 5,
+            "failed_metrics": 0,
+            "individual_scores": {
+              "code_edit": 1.0,
+              "formula_edit": 0.1228070175438597,
+              "table_edit": 1.0,
+              "table_TEDS": 1.0,
+              "text_edit": 0.23529411764705888
+            }
+          }
+        }
+      },
+      "sample_metadata": {
+        "url": null,
+        "domain": null,
+        "language": null,
+        "content_type": null,
+        "difficulty": null
+      }
+    }
+  ],
+  "category_metrics": {
+    "unknown": {
+      "code_edit": 0.8293333333333334,
+      "formula_edit": 0.7076023391812866,
+      "table_edit": 0.9629629629629629,
+      "table_TEDS": 1.0,
+      "text_edit": 0.6105951152390782,
+      "overall": 0.8220987501433322
+    }
+  },
+  "error_analysis": {
+    "total_samples": 3,
+    "failed_count": 0,
+    "success_rate": 1.0,
+    "common_errors": {},
+    "sample_errors": []
+  },
+  "extractor_config": {
+    "model_path": "/Users/chupei/model/checkpoint-3296"
+  },
+  "metric_config": {}
+}
\ No newline at end of file