Skip to content

Commit 39e5150

Browse files
feat: add summarize.py
1 parent 5575545 commit 39e5150

File tree

1 file changed

+71
-0
lines changed

1 file changed

+71
-0
lines changed

scripts/summarize.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import json
2+
import os
3+
import pandas as pd
4+
import argparse
5+
6+
# Task ID -> task type name. The names are the values that
# ``map_task_type`` is asked to translate back into IDs.
task_map = {
    "1-1": "key_single_choice",
    "1-2": "key_multiple_choice",
    "1-3": "key_blank_fill",
    "1-4": "key_text_generation",
    "2-1": "prime_summary",
    "2-2": "key_info",
    "3-1": "context_single_choice",
    "3-2": "context_multiple_choice",
    "3-3": "context_blank_fill",
    "3-4": "context_text_generation",
    "3-5": "classification",
}

# Reverse index of ``task_map`` (task type name -> task ID), built once so a
# lookup does not rescan the whole mapping on every call.  Iterating in
# reverse makes the *first* ID win if two IDs ever share a name, which is
# what a front-to-back scan of ``task_map`` would return.
_TASK_ID_BY_NAME = {
    name: task_id for task_id, name in reversed(list(task_map.items()))
}


def map_task_type(task_type):
    """Map a task type name to its task ID.

    Args:
        task_type: Task type name, e.g. ``"key_single_choice"``.

    Returns:
        The matching key of ``task_map`` (e.g. ``"1-1"``), or ``None`` when
        *task_type* is not one of the known names.
    """
    try:
        return _TASK_ID_BY_NAME.get(task_type)
    except TypeError:
        # Unhashable values (lists, dicts, ...) cannot equal any task name,
        # so report "not found" instead of raising.
        return None
28+
29+
def count_task_split_type(folder_path, output_path='task_split_type_count.csv'):
    """Count items per task type and level-1 split, then write a summary CSV.

    Each ``*.json`` file directly inside *folder_path* is loaded and iterated;
    every item must provide ``item['task_type']`` and
    ``item['split']['level1']``.  The written table has one row per task type
    plus a ``total`` row, one column per split plus a ``total`` column, and
    every cell is rendered as ``"<count> (<percent of grand total>%)"``.
    Rows are ordered by task ID; rows without a known ID (including the
    ``total`` row) come last in their original order.

    Args:
        folder_path: Directory containing the JSON files (not recursive).
        output_path: Where to write the CSV.  Defaults to
            ``task_split_type_count.csv`` in the current working directory.

    Returns:
        None. The result is written to *output_path* (UTF-8 with BOM).
    """
    counts = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the column
    # order of the output reproducible across machines and runs.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Skip sub-directories and stray non-JSON files instead of crashing
        # inside json.load.
        if not file_name.lower().endswith('.json') or not os.path.isfile(file_path):
            continue
        # utf-8-sig reads UTF-8 with or without a BOM, independent of the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
        for item in data:
            split = item['split']['level1']
            task_type = item['task_type']
            per_split = counts.setdefault(split, {})
            per_split[task_type] = per_split.get(task_type, 0) + 1

    # Columns are splits, rows are task types; missing combinations count 0.
    df = pd.DataFrame(counts).fillna(0)
    df['total'] = df.sum(axis=1)
    df.loc['total'] = df.sum()
    df = df.astype(int)

    grand_total = int(df.loc['total', 'total'])
    for col in df.columns:
        column = df[col]
        if grand_total:
            share = (column / grand_total * 100).round(2)
        else:
            # No items at all: report 0.0% rather than dividing by zero.
            share = column * 0.0
        df[col] = column.astype(str) + ' (' + share.astype(str) + '%)'

    df = df.reset_index().rename(columns={'index': 'task_type'})
    df['task_id'] = df['task_type'].apply(map_task_type)

    leading = ['task_id', 'task_type']
    df = df[leading + [col for col in df.columns if col not in leading]]
    # A stable sort keeps rows without a task ID in their original relative
    # order, so the 'total' row (appended last) stays at the bottom.
    df = df.sort_values(by='task_id', kind='mergesort').reset_index(drop=True)

    df.to_csv(output_path, encoding='utf-8-sig', index=False)
64+
65+
if __name__ == "__main__":
    # Command-line entry point: a single positional argument names the
    # folder whose JSON files are summarized.
    cli = argparse.ArgumentParser(
        description="Count task split types in a folder.",
    )
    cli.add_argument(
        'folder_path',
        type=str,
        help='Path to the folder containing JSON files.',
    )
    count_task_split_type(cli.parse_args().folder_path)
71+

0 commit comments

Comments
 (0)