Skip to content

Commit 5575545

Browse files
feat: update data & add scripts
1 parent 2469c5e commit 5575545

File tree

21 files changed

+31929
-40
lines changed

21 files changed

+31929
-40
lines changed

scripts/update.py

Lines changed: 69 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,69 @@
1+
import os
2+
import json
3+
import argparse
4+
5+
def update_data(source_file, target_file):
    """
    Merge the records of a source JSON file into those of a target JSON file.

    Both files are read as UTF-8 and are expected to contain a JSON list.
    If the target file does not exist, it is created (together with any
    missing parent directories) holding an empty list. The merged list is
    only returned; nothing is written back to the target file here.

    Args:
        source_file: Path of the JSON file providing the new records.
        target_file: Path of the JSON file whose records are extended.

    Returns:
        The target file's records followed by the source file's records.

    Raises:
        FileNotFoundError: If source_file does not exist.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    if not os.path.exists(target_file):
        # A bare file name has an empty dirname; os.makedirs('') would raise.
        parent_dir = os.path.dirname(target_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(target_file, 'w', encoding='utf-8') as f:
            json.dump([], f)  # Create an empty list if the file does not exist

    # Explicit UTF-8: the data may hold non-ASCII text, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(source_file, 'r', encoding='utf-8') as f:
        new_items = json.load(f)

    with open(target_file, 'r', encoding='utf-8') as f:
        target_data = json.load(f)

    print(f"Before update: {len(target_data)} items in {target_file}")
    target_data.extend(new_items)
    print(f"After update: {len(target_data)} items in {target_file}")

    return target_data
25+
26+
def deduplicate_data(data, keys=('instruction', 'question')):
    """
    Deduplicate the data based on the values of the given fields.

    Two items are duplicates when they hold equal values for every field in
    `keys`. The first occurrence is kept and the input order is preserved.

    Args:
        data: Iterable of dict-like items.
        keys: Field names forming the identity of an item. Defaults to
            ('instruction', 'question').

    Returns:
        A new list with the unique items; `data` itself is not modified.

    Raises:
        KeyError: If an item lacks one of the fields in `keys`.
        TypeError: If a field value is unhashable.
    """
    seen = set()
    deduplicated_data = []
    for item in data:
        # Strict indexing on purpose: a malformed record should surface
        # instead of being silently merged with other malformed records.
        key = tuple(item[field] for field in keys)
        if key not in seen:
            seen.add(key)
            deduplicated_data.append(item)
    print(f"Deduplicated data: {len(deduplicated_data)} items")
    return deduplicated_data
39+
40+
def save_data(data, target_file):
    """
    Save the data to the target file as UTF-8 encoded, indented JSON.

    Missing parent directories of the target file are created first.
    Non-ASCII characters are written as-is (not as \\uXXXX escapes).

    Args:
        data: JSON-serializable object to write.
        target_file: Path of the file to (over)write.
    """
    # A bare file name has an empty dirname; os.makedirs('') would raise.
    parent_dir = os.path.dirname(target_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly; the platform default may not encode them.
    with open(target_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {target_file}")
47+
48+
49+
def main():
    """
    Merge every JSON file under the source folders into the target folders.

    For each of the 'one-shot' and 'zero-shot' sub-folders of --src_dir, each
    '*.json' file is merged into the file of the same name under the matching
    sub-folder of --target_dir, deduplicated, and written back.
    """
    parser = argparse.ArgumentParser(description="Update and deduplicate data files.")
    parser.add_argument('--src_dir', type=str, required=True, help='Source directory containing the data files.')
    parser.add_argument('--target_dir', type=str, required=True, help='Target directory to save the updated data files.')

    args = parser.parse_args()
    src_dir = args.src_dir
    target_dir = args.target_dir

    folders = ['one-shot', 'zero-shot']
    for folder in folders:
        # The target sub-folder may not exist on a first run.
        os.makedirs(os.path.join(target_dir, folder), exist_ok=True)
        # os.listdir order is arbitrary; sort for reproducible runs and logs.
        for src_file in sorted(os.listdir(os.path.join(src_dir, folder))):
            if not src_file.endswith('.json'):
                continue
            src_file_path = os.path.join(src_dir, folder, src_file)
            target_file_path = os.path.join(target_dir, folder, src_file)
            print(f"Updating {target_file_path} with data from {src_file_path}")
            updated_data = update_data(src_file_path, target_file_path)
            deduplicated_data = deduplicate_data(updated_data)
            save_data(deduplicated_data, target_file_path)
    print("Data update completed.")


if __name__ == '__main__':
    main()
69+

sft_model_eval/one-shot/1-1.json

Lines changed: 1190 additions & 0 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/1-2.json

Lines changed: 730 additions & 0 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/1-3.json

Lines changed: 1580 additions & 10 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/1-4.json

Lines changed: 1270 additions & 10 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/2-1.json

Lines changed: 3190 additions & 0 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/2-2.json

Lines changed: 3190 additions & 0 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/3-1.json

Lines changed: 1200 additions & 0 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/3-2.json

Lines changed: 730 additions & 0 deletions
Large diffs are not rendered by default.

sft_model_eval/one-shot/3-3.json

Lines changed: 1580 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)