Skip to content

Commit d90497a

Browse files
Merge pull request #12 from open-sciencelab/update-data
feat: update data & add scripts
2 parents 2469c5e + 39e5150 commit d90497a

File tree

22 files changed

+32000
-40
lines changed

22 files changed

+32000
-40
lines changed

scripts/summarize.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import json
2+
import os
3+
import pandas as pd
4+
import argparse
5+
6+
# Mapping from benchmark task ID to task name.
task_map = {
    "1-1": "key_single_choice",
    "1-2": "key_multiple_choice",
    "1-3": "key_blank_fill",
    "1-4": "key_text_generation",
    "2-1": "prime_summary",
    "2-2": "key_info",
    "3-1": "context_single_choice",
    "3-2": "context_multiple_choice",
    "3-3": "context_blank_fill",
    "3-4": "context_text_generation",
    "3-5": "classification"
}

# Reverse lookup built once at import time so map_task_type is an O(1)
# dict lookup instead of scanning task_map on every call.
_task_id_by_name = {name: task_id for task_id, name in task_map.items()}


def map_task_type(task_type):
    """
    Map a task name (e.g. "key_single_choice") to its task ID (e.g. "1-1").

    Returns None for unknown names (including the synthetic 'total' row
    produced by count_task_split_type).
    """
    return _task_id_by_name.get(task_type)
28+
29+
def count_task_split_type(folder_path, output_path='task_split_type_count.csv'):
    """
    Count items per (level-1 split, task type) across the JSON files in
    *folder_path* and write a summary table to *output_path* as CSV.

    Each JSON file must contain a list of items that provide at least
    ``item['split']['level1']`` and ``item['task_type']``.

    The table has one row per task type plus a 'total' row, and one column
    per split plus a 'total' column; every cell is rendered as
    "count (percent%)" relative to the grand total.
    """
    type_count = {}
    for file in os.listdir(folder_path):
        # Skip subdirectories and non-JSON files, which would otherwise
        # make open()/json.load() raise.
        if not file.endswith('.json'):
            continue
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            split = item['split']['level1']
            task_type = item['task_type']
            if split not in type_count:
                type_count[split] = {}
            if task_type not in type_count[split]:
                type_count[split][task_type] = 0
            type_count[split][task_type] += 1

    # Columns = splits, index = task types; absent combinations become 0.
    df = pd.DataFrame(type_count)
    df.fillna(0, inplace=True)
    df['total'] = df.sum(axis=1)   # per-task-type row totals
    df.loc['total'] = df.sum()     # per-split column totals; grand total lands in the corner
    df['total'] = df['total'].astype(int)

    for col in df.columns:
        df[col] = df[col].astype(int)

    # Format every cell as "count (pct%)" of the grand total.
    total = df.loc['total', 'total']
    df['total'] = df['total'].astype(str) + ' (' + (df['total'] / total * 100).round(2).astype(str) + '%)'
    for col in df.columns:
        if col != 'total':
            df[col] = df[col].astype(str) + ' (' + (df[col] / total * 100).round(2).astype(str) + '%)'

    df = df.reset_index().rename(columns={'index': 'task_type'})

    # Attach task IDs and order rows by ID; the 'total' row maps to None
    # and is sorted last by pandas' missing-value handling.
    df['task_id'] = df['task_type'].apply(map_task_type)
    df = df[['task_id', 'task_type'] + [col for col in df.columns if col not in ['task_id', 'task_type']]]
    df = df.sort_values(by='task_id').reset_index(drop=True)

    df.to_csv(output_path, encoding='utf-8-sig', index=False)
64+
65+
if __name__ == "__main__":
    # CLI entry point: one positional argument naming the data folder.
    arg_parser = argparse.ArgumentParser(description="Count task split types in a folder.")
    arg_parser.add_argument('folder_path', type=str, help='Path to the folder containing JSON files.')
    cli_args = arg_parser.parse_args()
    count_task_split_type(cli_args.folder_path)
71+

scripts/update.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import os
2+
import json
3+
import argparse
4+
5+
def update_data(source_file, target_file):
    """
    Append the items from *source_file* onto the contents of *target_file*.

    Both files must contain a JSON list. If the target file does not exist
    it is initialized as an empty list first.

    Returns the combined list. The target file itself is not rewritten
    here; callers persist the result via save_data.
    """
    if not os.path.exists(target_file):
        # Seed the target with an empty list so the read below always works.
        with open(target_file, 'w', encoding='utf-8') as f:
            json.dump([], f)

    with open(source_file, 'r', encoding='utf-8') as f:
        # Renamed from `update_data`: a local must not shadow the
        # enclosing function's own name.
        new_items = json.load(f)

    with open(target_file, 'r', encoding='utf-8') as f:
        target_data = json.load(f)

    print(f"Before update: {len(target_data)} items in {target_file}")
    target_data.extend(new_items)
    print(f"After update: {len(target_data)} items in {target_file}")

    return target_data
25+
26+
def deduplicate_data(data):
    """
    Deduplicate the data based on 'instruction' and 'question'.

    The first item seen for each (instruction, question) pair wins;
    original ordering is preserved.
    """
    # An insertion-ordered dict keyed by the dedup tuple keeps exactly the
    # first occurrence of each pair.
    unique = {}
    for entry in data:
        ident = (entry['instruction'], entry['question'])
        if ident not in unique:
            unique[ident] = entry
    deduplicated_data = list(unique.values())
    print(f"Deduplicated data: {len(deduplicated_data)} items")
    return deduplicated_data
39+
40+
def save_data(data, target_file):
    """
    Serialize *data* to *target_file* as pretty-printed JSON.

    The file is opened with an explicit UTF-8 encoding: ensure_ascii=False
    emits raw non-ASCII characters, which would fail to encode on platforms
    whose default locale encoding cannot represent them.
    """
    with open(target_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {target_file}")
47+
48+
49+
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Update and deduplicate data files.")
    parser.add_argument('--src_dir', type=str, required=True, help='Source directory containing the data files.')
    parser.add_argument('--target_dir', type=str, required=True, help='Target directory to save the updated data files.')

    args = parser.parse_args()
    src_dir = args.src_dir
    target_dir = args.target_dir

    folders = ['one-shot', 'zero-shot']
    for folder in folders:
        # Ensure the mirrored target subfolder exists: update_data opens
        # files directly inside it and would otherwise fail with
        # FileNotFoundError on a fresh target directory.
        os.makedirs(os.path.join(target_dir, folder), exist_ok=True)
        for src_file in os.listdir(os.path.join(src_dir, folder)):
            if src_file.endswith('.json'):
                src_file_path = os.path.join(src_dir, folder, src_file)
                target_file_path = os.path.join(target_dir, folder, src_file)
                print(f"Updating {target_file_path} with data from {src_file_path}")
                updated_data = update_data(src_file_path, target_file_path)
                deduplicated_data = deduplicate_data(updated_data)
                save_data(deduplicated_data, target_file_path)
    print("Data update completed.")
69+

0 commit comments

Comments
 (0)