Skip to content

Commit a5b02bf

Browse files
authored
Merge pull request #6 from e06084/main
feat: commit results
2 parents 62e4985 + adfcee6 commit a5b02bf

File tree

3 files changed

+330
-2
lines changed

3 files changed

+330
-2
lines changed

.gitignore

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,4 @@ output/
4545
.coverage*
4646
coverage.xml
4747

48-
webmainbench.egg-info/*
49-
results/*
48+
webmainbench.egg-info/*
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
extractor,total_samples,success_rate,overall,code_edit,formula_edit,table_TEDS,table_edit,text_edit
2+
llm-webkit,3,1.0,0.8221,0.8293,0.7076,1.0,0.963,0.6106
Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
{
2+
"metadata": {
3+
"dataset_name": "llm_webkit_test",
4+
"extractor_name": "llm-webkit",
5+
"timestamp": "2025-07-31T13:52:12.948959",
6+
"total_samples": 3
7+
},
8+
"overall_metrics": {
9+
"code_edit": 0.8293333333333334,
10+
"formula_edit": 0.7076023391812866,
11+
"table_edit": 0.9629629629629629,
12+
"table_TEDS": 1.0,
13+
"text_edit": 0.6105951152390782,
14+
"overall": 0.8220987501433322
15+
},
16+
"sample_results": [
17+
{
18+
"sample_id": "text_code_sample",
19+
"extraction_success": true,
20+
"extraction_time": 3.6406631469726562,
21+
"metrics": {
22+
"code_edit": {
23+
"score": 0.488,
24+
"success": true,
25+
"details": {
26+
"distance": 64,
27+
"predicted_length": 125,
28+
"groundtruth_length": 61,
29+
"normalized": true,
30+
"predicted_code_length": 125,
31+
"groundtruth_code_length": 61,
32+
"content_type": "code"
33+
}
34+
},
35+
"formula_edit": {
36+
"score": 1.0,
37+
"success": true,
38+
"details": {
39+
"distance": 0,
40+
"predicted_length": 0,
41+
"groundtruth_length": 0,
42+
"normalized": true,
43+
"predicted_formula_length": 0,
44+
"groundtruth_formula_length": 0,
45+
"content_type": "formula"
46+
}
47+
},
48+
"table_edit": {
49+
"score": 1.0,
50+
"success": true,
51+
"details": {
52+
"distance": 0,
53+
"predicted_length": 0,
54+
"groundtruth_length": 0,
55+
"normalized": true,
56+
"predicted_table_length": 0,
57+
"groundtruth_table_length": 0,
58+
"content_type": "table"
59+
}
60+
},
61+
"table_TEDS": {
62+
"score": 1.0,
63+
"success": true,
64+
"details": {
65+
"edit_distance": 0.0,
66+
"predicted_nodes": 3,
67+
"groundtruth_nodes": 3,
68+
"max_nodes": 3,
69+
"structure_only": false,
70+
"algorithm": "TEDS",
71+
"content_type": "table"
72+
}
73+
},
74+
"text_edit": {
75+
"score": 0.9298245614035088,
76+
"success": true,
77+
"details": {
78+
"distance": 4,
79+
"predicted_length": 57,
80+
"groundtruth_length": 53,
81+
"normalized": true,
82+
"predicted_text_length": 57,
83+
"groundtruth_text_length": 53,
84+
"content_type": "text"
85+
}
86+
},
87+
"overall": {
88+
"score": 0.8835649122807018,
89+
"success": true,
90+
"details": {
91+
"source": "average_of_all_metrics",
92+
"description": "Overall score as average of all successful metrics",
93+
"successful_metrics": 5,
94+
"failed_metrics": 0,
95+
"individual_scores": {
96+
"code_edit": 0.488,
97+
"formula_edit": 1.0,
98+
"table_edit": 1.0,
99+
"table_TEDS": 1.0,
100+
"text_edit": 0.9298245614035088
101+
}
102+
}
103+
}
104+
},
105+
"sample_metadata": {
106+
"url": null,
107+
"domain": null,
108+
"language": null,
109+
"content_type": null,
110+
"difficulty": null
111+
}
112+
},
113+
{
114+
"sample_id": "table_sample",
115+
"extraction_success": true,
116+
"extraction_time": 1.6590700149536133,
117+
"metrics": {
118+
"code_edit": {
119+
"score": 1.0,
120+
"success": true,
121+
"details": {
122+
"distance": 0,
123+
"predicted_length": 0,
124+
"groundtruth_length": 0,
125+
"normalized": true,
126+
"predicted_code_length": 0,
127+
"groundtruth_code_length": 0,
128+
"content_type": "code"
129+
}
130+
},
131+
"formula_edit": {
132+
"score": 1.0,
133+
"success": true,
134+
"details": {
135+
"distance": 0,
136+
"predicted_length": 0,
137+
"groundtruth_length": 0,
138+
"normalized": true,
139+
"predicted_formula_length": 0,
140+
"groundtruth_formula_length": 0,
141+
"content_type": "formula"
142+
}
143+
},
144+
"table_edit": {
145+
"score": 0.8888888888888888,
146+
"success": true,
147+
"details": {
148+
"distance": 9,
149+
"predicted_length": 72,
150+
"groundtruth_length": 81,
151+
"normalized": true,
152+
"predicted_table_length": 72,
153+
"groundtruth_table_length": 81,
154+
"content_type": "table"
155+
}
156+
},
157+
"table_TEDS": {
158+
"score": 1.0,
159+
"success": true,
160+
"details": {
161+
"edit_distance": 0.0,
162+
"predicted_nodes": 13,
163+
"groundtruth_nodes": 13,
164+
"max_nodes": 13,
165+
"structure_only": false,
166+
"algorithm": "TEDS",
167+
"content_type": "table"
168+
}
169+
},
170+
"text_edit": {
171+
"score": 0.6666666666666667,
172+
"success": true,
173+
"details": {
174+
"distance": 3,
175+
"predicted_length": 9,
176+
"groundtruth_length": 6,
177+
"normalized": true,
178+
"predicted_text_length": 9,
179+
"groundtruth_text_length": 6,
180+
"content_type": "text"
181+
}
182+
},
183+
"overall": {
184+
"score": 0.9111111111111111,
185+
"success": true,
186+
"details": {
187+
"source": "average_of_all_metrics",
188+
"description": "Overall score as average of all successful metrics",
189+
"successful_metrics": 5,
190+
"failed_metrics": 0,
191+
"individual_scores": {
192+
"code_edit": 1.0,
193+
"formula_edit": 1.0,
194+
"table_edit": 0.8888888888888888,
195+
"table_TEDS": 1.0,
196+
"text_edit": 0.6666666666666667
197+
}
198+
}
199+
}
200+
},
201+
"sample_metadata": {
202+
"url": null,
203+
"domain": null,
204+
"language": null,
205+
"content_type": null,
206+
"difficulty": null
207+
}
208+
},
209+
{
210+
"sample_id": "formula_sample",
211+
"extraction_success": true,
212+
"extraction_time": 1.5354089736938477,
213+
"metrics": {
214+
"code_edit": {
215+
"score": 1.0,
216+
"success": true,
217+
"details": {
218+
"distance": 0,
219+
"predicted_length": 0,
220+
"groundtruth_length": 0,
221+
"normalized": true,
222+
"predicted_code_length": 0,
223+
"groundtruth_code_length": 0,
224+
"content_type": "code"
225+
}
226+
},
227+
"formula_edit": {
228+
"score": 0.1228070175438597,
229+
"success": true,
230+
"details": {
231+
"distance": 50,
232+
"predicted_length": 9,
233+
"groundtruth_length": 57,
234+
"normalized": true,
235+
"predicted_formula_length": 9,
236+
"groundtruth_formula_length": 57,
237+
"content_type": "formula"
238+
}
239+
},
240+
"table_edit": {
241+
"score": 1.0,
242+
"success": true,
243+
"details": {
244+
"distance": 0,
245+
"predicted_length": 0,
246+
"groundtruth_length": 0,
247+
"normalized": true,
248+
"predicted_table_length": 0,
249+
"groundtruth_table_length": 0,
250+
"content_type": "table"
251+
}
252+
},
253+
"table_TEDS": {
254+
"score": 1.0,
255+
"success": true,
256+
"details": {
257+
"edit_distance": 0.0,
258+
"predicted_nodes": 3,
259+
"groundtruth_nodes": 3,
260+
"max_nodes": 3,
261+
"structure_only": false,
262+
"algorithm": "TEDS",
263+
"content_type": "table"
264+
}
265+
},
266+
"text_edit": {
267+
"score": 0.23529411764705888,
268+
"success": true,
269+
"details": {
270+
"distance": 65,
271+
"predicted_length": 85,
272+
"groundtruth_length": 37,
273+
"normalized": true,
274+
"predicted_text_length": 85,
275+
"groundtruth_text_length": 37,
276+
"content_type": "text"
277+
}
278+
},
279+
"overall": {
280+
"score": 0.6716202270381837,
281+
"success": true,
282+
"details": {
283+
"source": "average_of_all_metrics",
284+
"description": "Overall score as average of all successful metrics",
285+
"successful_metrics": 5,
286+
"failed_metrics": 0,
287+
"individual_scores": {
288+
"code_edit": 1.0,
289+
"formula_edit": 0.1228070175438597,
290+
"table_edit": 1.0,
291+
"table_TEDS": 1.0,
292+
"text_edit": 0.23529411764705888
293+
}
294+
}
295+
}
296+
},
297+
"sample_metadata": {
298+
"url": null,
299+
"domain": null,
300+
"language": null,
301+
"content_type": null,
302+
"difficulty": null
303+
}
304+
}
305+
],
306+
"category_metrics": {
307+
"unknown": {
308+
"code_edit": 0.8293333333333334,
309+
"formula_edit": 0.7076023391812866,
310+
"table_edit": 0.9629629629629629,
311+
"table_TEDS": 1.0,
312+
"text_edit": 0.6105951152390782,
313+
"overall": 0.8220987501433322
314+
}
315+
},
316+
"error_analysis": {
317+
"total_samples": 3,
318+
"failed_count": 0,
319+
"success_rate": 1.0,
320+
"common_errors": {},
321+
"sample_errors": []
322+
},
323+
"extractor_config": {
324+
"model_path": "/Users/chupei/model/checkpoint-3296"
325+
},
326+
"metric_config": {}
327+
}

0 commit comments

Comments
 (0)