Skip to content

Commit d90497a

Browse files
Merge pull request #12 from open-sciencelab/update-data
feat: update data & add scripts
2 parents 2469c5e + 39e5150 commit d90497a

File tree

22 files changed

+32000
-40
lines changed

22 files changed

+32000
-40
lines changed

scripts/summarize.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import json
2+
import os
3+
import pandas as pd
4+
import argparse
5+
6+
# Mapping from benchmark task ID to task name.
task_map = {
    "1-1": "key_single_choice",
    "1-2": "key_multiple_choice",
    "1-3": "key_blank_fill",
    "1-4": "key_text_generation",
    "2-1": "prime_summary",
    "2-2": "key_info",
    "3-1": "context_single_choice",
    "3-2": "context_multiple_choice",
    "3-3": "context_blank_fill",
    "3-4": "context_text_generation",
    "3-5": "classification"
}

# Reverse lookup built once at import time so map_task_type is an O(1)
# dict lookup instead of scanning task_map on every call.
_task_id_by_name = {name: task_id for task_id, name in task_map.items()}


def map_task_type(task_type):
    """
    Map a task name (e.g. "key_single_choice") to its task ID (e.g. "1-1").

    Returns None for unknown names (including the synthetic 'total' row
    produced by count_task_split_type).
    """
    return _task_id_by_name.get(task_type)
28+
29+
def count_task_split_type(folder_path, output_path='task_split_type_count.csv'):
    """
    Count items per (level-1 split, task type) across the JSON files in
    *folder_path* and write a summary table to *output_path* as CSV.

    Each JSON file must contain a list of items that provide at least
    ``item['split']['level1']`` and ``item['task_type']``.

    The table has one row per task type plus a 'total' row, and one column
    per split plus a 'total' column; every cell is rendered as
    "count (percent%)" relative to the grand total.
    """
    type_count = {}
    for file in os.listdir(folder_path):
        # Skip subdirectories and non-JSON files, which would otherwise
        # make open()/json.load() raise.
        if not file.endswith('.json'):
            continue
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            split = item['split']['level1']
            task_type = item['task_type']
            if split not in type_count:
                type_count[split] = {}
            if task_type not in type_count[split]:
                type_count[split][task_type] = 0
            type_count[split][task_type] += 1

    # Columns = splits, index = task types; absent combinations become 0.
    df = pd.DataFrame(type_count)
    df.fillna(0, inplace=True)
    df['total'] = df.sum(axis=1)   # per-task-type row totals
    df.loc['total'] = df.sum()     # per-split column totals; grand total lands in the corner
    df['total'] = df['total'].astype(int)

    for col in df.columns:
        df[col] = df[col].astype(int)

    # Format every cell as "count (pct%)" of the grand total.
    total = df.loc['total', 'total']
    df['total'] = df['total'].astype(str) + ' (' + (df['total'] / total * 100).round(2).astype(str) + '%)'
    for col in df.columns:
        if col != 'total':
            df[col] = df[col].astype(str) + ' (' + (df[col] / total * 100).round(2).astype(str) + '%)'

    df = df.reset_index().rename(columns={'index': 'task_type'})

    # Attach task IDs and order rows by ID; the 'total' row maps to None
    # and is sorted last by pandas' missing-value handling.
    df['task_id'] = df['task_type'].apply(map_task_type)
    df = df[['task_id', 'task_type'] + [col for col in df.columns if col not in ['task_id', 'task_type']]]
    df = df.sort_values(by='task_id').reset_index(drop=True)

    df.to_csv(output_path, encoding='utf-8-sig', index=False)
64+
65+
if __name__ == "__main__":
    # CLI entry point: one positional argument naming the data folder.
    arg_parser = argparse.ArgumentParser(description="Count task split types in a folder.")
    arg_parser.add_argument('folder_path', type=str, help='Path to the folder containing JSON files.')
    cli_args = arg_parser.parse_args()
    count_task_split_type(cli_args.folder_path)
71+

scripts/update.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import os
2+
import json
3+
import argparse
4+
5+
def update_data(source_file, target_file):
    """
    Append the items from *source_file* onto the contents of *target_file*.

    Both files must contain a JSON list. If the target file does not exist
    it is initialized as an empty list first.

    Returns the combined list. The target file itself is not rewritten
    here; callers persist the result via save_data.
    """
    if not os.path.exists(target_file):
        # Seed the target with an empty list so the read below always works.
        with open(target_file, 'w', encoding='utf-8') as f:
            json.dump([], f)

    with open(source_file, 'r', encoding='utf-8') as f:
        # Renamed from `update_data`: a local must not shadow the
        # enclosing function's own name.
        new_items = json.load(f)

    with open(target_file, 'r', encoding='utf-8') as f:
        target_data = json.load(f)

    print(f"Before update: {len(target_data)} items in {target_file}")
    target_data.extend(new_items)
    print(f"After update: {len(target_data)} items in {target_file}")

    return target_data
25+
26+
def deduplicate_data(data):
    """
    Deduplicate the data based on 'instruction' and 'question'.

    The first item seen for each (instruction, question) pair wins;
    original ordering is preserved.
    """
    # An insertion-ordered dict keyed by the dedup tuple keeps exactly the
    # first occurrence of each pair.
    unique = {}
    for entry in data:
        ident = (entry['instruction'], entry['question'])
        if ident not in unique:
            unique[ident] = entry
    deduplicated_data = list(unique.values())
    print(f"Deduplicated data: {len(deduplicated_data)} items")
    return deduplicated_data
39+
40+
def save_data(data, target_file):
    """
    Serialize *data* to *target_file* as pretty-printed JSON.

    The file is opened with an explicit UTF-8 encoding: ensure_ascii=False
    emits raw non-ASCII characters, which would fail to encode on platforms
    whose default locale encoding cannot represent them.
    """
    with open(target_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {target_file}")
47+
48+
49+
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Update and deduplicate data files.")
    parser.add_argument('--src_dir', type=str, required=True, help='Source directory containing the data files.')
    parser.add_argument('--target_dir', type=str, required=True, help='Target directory to save the updated data files.')

    args = parser.parse_args()
    src_dir = args.src_dir
    target_dir = args.target_dir

    folders = ['one-shot', 'zero-shot']
    for folder in folders:
        # Ensure the mirrored target subfolder exists: update_data opens
        # files directly inside it and would otherwise fail with
        # FileNotFoundError on a fresh target directory.
        os.makedirs(os.path.join(target_dir, folder), exist_ok=True)
        for src_file in os.listdir(os.path.join(src_dir, folder)):
            if src_file.endswith('.json'):
                src_file_path = os.path.join(src_dir, folder, src_file)
                target_file_path = os.path.join(target_dir, folder, src_file)
                print(f"Updating {target_file_path} with data from {src_file_path}")
                updated_data = update_data(src_file_path, target_file_path)
                deduplicated_data = deduplicate_data(updated_data)
                save_data(deduplicated_data, target_file_path)
    print("Data update completed.")
69+

0 commit comments

Comments
 (0)