Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions configs/all_tasks.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
{
"infer": {
"posttrain": ["arc_ar", "arc_bn", "arc_ca", "arc_da", "arc_de", "arc_es", "arc_eu", "arc_fr", "arc_gu", "arc_hi", "arc_hr", "arc_hu", "arc_hy", "arc_id", "arc_it", "arc_kn", "arc_ml", "arc_mr", "arc_ne", "arc_nl", "arc_pt", "arc_ro", "arc_ru", "arc_sk", "arc_sr", "arc_sv", "arc_ta", "arc_te", "arc_uk", "arc_vi", "arc_zh", "global_mmlu_ar", "global_mmlu_bn", "global_mmlu_de", "global_mmlu_es", "global_mmlu_fr", "global_mmlu_hi", "global_mmlu_id", "global_mmlu_it", "global_mmlu_ja", "global_mmlu_ko", "global_mmlu_pt", "global_mmlu_sw", "global_mmlu_yo", "global_mmlu_zh", "hellaswag", "hellaswag_ar", "hellaswag_bn", "hellaswag_ca", "hellaswag_da", "hellaswag_de", "hellaswag_es", "hellaswag_eu", "hellaswag_fr", "hellaswag_gu", "hellaswag_hi", "hellaswag_hr", "hellaswag_hu", "hellaswag_hy", "hellaswag_id", "hellaswag_it", "hellaswag_kn", "hellaswag_ml", "hellaswag_mr", "hellaswag_ne", "hellaswag_nl", "hellaswag_pt", "hellaswag_ro", "hellaswag_ru", "hellaswag_sk", "hellaswag_sr", "hellaswag_sv", "hellaswag_ta", "hellaswag_te", "hellaswag_uk", "hellaswag_vi", "include_base_44_albanian", "include_base_44_arabic", "include_base_44_armenian", "include_base_44_azerbaijani", "include_base_44_basque", "include_base_44_belarusian", "include_base_44_bengali", "include_base_44_bulgarian", "include_base_44_chinese", "include_base_44_croatian", "include_base_44_finnish", "include_base_44_french", "include_base_44_georgian", "include_base_44_german", "include_base_44_hebrew", "include_base_44_hindi", "include_base_44_hungarian", "include_base_44_indonesian", "include_base_44_italian", "include_base_44_japanese", "include_base_44_kazakh", "include_base_44_korean", "include_base_44_lithuanian", "include_base_44_malay", "include_base_44_malayalam", "include_base_44_nepali", "include_base_44_persian", "include_base_44_polish", "include_base_44_portuguese", "include_base_44_russian", "include_base_44_serbian", "include_base_44_spanish", "include_base_44_tagalog", "include_base_44_tamil", 
"include_base_44_telugu", "include_base_44_turkish", "include_base_44_ukrainian", "include_base_44_uzbek", "include_base_44_vietnamese", "mmlu", "winogrande", "xcopa_et", "xcopa_ht", "xcopa_qu", "xnli_ar", "xnli_bg", "xnli_de", "xnli_el", "xnli_en", "xnli_es", "xnli_fr", "xnli_hi", "xnli_ru", "xnli_sw", "xnli_th", "xnli_tr", "xnli_ur", "xnli_vi", "xnli_zh", "xwinograd_en", "xwinograd_fr", "xwinograd_pt", "xwinograd_ru", "xwinograd_zh"]
},
"other": [
{"name": "ai2_arc", "kinds": ["pretrain"], "size": null, "language": "en", "dimension": null, "alias": ["arc_easy", "arc_challenge"]},
{"name": "include_base_44_greek", "kinds": ["pretrain"], "size": null, "language": "el", "dimension": null},
{"name": "include_base_44_north macedonian", "kinds": ["pretrain"], "size": null, "language": "mk", "dimension": null},
{"name": "xwinograd_jp", "kinds": ["pretrain"], "size": null, "language": "ja", "dimension": null},
{"name": "include_base_44_dutch", "kinds": ["pretrain"], "size": 576, "language": null, "dimension": null},
{"name": "piqa", "kinds": ["pretrain"], "size": 21000, "language": null, "dimension": null},
{"name": "switzerland_qa_de", "kinds": ["pretrain"], "size": 9160, "language": null, "dimension": null},
{"name": "switzerland_qa_fr", "kinds": ["pretrain"], "size": 9160, "language": null, "dimension": null},
{"name": "switzerland_qa_it", "kinds": ["pretrain"], "size": 9160, "language": null, "dimension": null},
{"name": "switzerland_qa_rm", "kinds": ["pretrain"], "size": 9160, "language": null, "dimension": null},
{"name": "switzerland_qa_en", "kinds": ["pretrain"], "size": 9160, "language": null, "dimension": null},
{"name": "cultural_bench", "kinds": ["pretrain"], "size": null, "language": "en", "dimension": null},
{"name": "blend_algeria", "kinds": ["pretrain"], "size": 20364, "language": "ar", "kind": null},
{"name": "blend_assam", "kinds": ["pretrain"], "size": 21293, "language": "as", "kind": null},
{"name": "blend_azerbaijan", "kinds": ["pretrain"], "size": 19932, "language": "az", "kind": null},
{"name": "blend_china", "kinds": ["pretrain"], "size": 20410, "language": "zh", "kind": null},
{"name": "blend_ethiopia", "kinds": ["pretrain"], "size": 22712, "language": "am", "kind": null},
{"name": "blend_greece", "kinds": ["pretrain"], "size": 20383, "language": "el", "kind": null},
{"name": "blend_indonesia", "kinds": ["pretrain"], "size": 18417, "language": "id", "kind": null},
{"name": "blend_iran", "kinds": ["pretrain"], "size": 19371, "language": "fa", "kind": null},
{"name": "blend_mexico", "kinds": ["pretrain"], "size": 20513, "language": "es", "kind": null},
{"name": "blend_north_korea", "kinds": ["pretrain"], "size": 17005, "language": "ko", "kind": null},
{"name": "blend_northern_nigeria", "kinds": ["pretrain"], "size": 16317, "language": "ha", "kind": null},
{"name": "blend_south_korea", "kinds": ["pretrain"], "size": 21439, "language": "ko", "kind": null},
{"name": "blend_spain", "kinds": ["pretrain"], "size": 19280, "language": "es", "kind": null},
{"name": "blend_uk", "kinds": ["pretrain"], "size": 16723, "language": "en", "kind": null},
{"name": "blend_us", "kinds": ["pretrain"], "size": 16491, "language": "en", "kind": null},
{"name": "blend_west_java", "kinds": ["pretrain"], "size": 15289, "language": "su", "kind": null}
]
}
5 changes: 3 additions & 2 deletions configs/automation.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
"/capstor/scratch/cscs/asolergi/main_run_70B_megatron/Megatron-LM/logs/Meg-Runs/main-runs-v1/apertus3-70b-512-nodes-1e-5lr/checkpoints",
"/capstor/scratch/cscs/asolergi/main_run_70B_megatron/Megatron-LM/logs/Meg-Runs/main-runs-v1/apertus3-70b-512-nodes-1e-5lr/checkpoints-512-noOverlap"
],
"max_samples": 500000,
"size": 70,
"tokens_per_iter": "8388608:523519,16777216:",
"frequency": 30000,
"start_eval_from": 830000
"frequency": 15000,
"start_eval_from": 1070000
}
}
}
52 changes: 46 additions & 6 deletions configs/tasks.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,49 @@
{
"show_in_table": ["mmlu/acc", "gsm8k/exact_match", "arc_challenge/acc", "hellaswag/acc", "m_hellaswag/acc", "m_arc/acc", "include_base_44/acc"],
"root": "swissai_eval",
"groups": {
"swissai_eval": ["mmlu", "hellaswag", "mmlu_continuation", "winogrande", "piqa", "openbookqa", "arc_challenge", "arc_easy", "commonsense_qa", "lambada_openai", "lambada_standard", "wikitext", "gsm8k", "squadv2", "include_base_44", "xcopa", "xnli", "xwinograd", "pawsx", "m_arc", "global_mmlu", "m_hellaswag"],
"english": ["mmlu", "hellaswag", "mmlu_continuation", "winogrande", "piqa", "openbookqa", "arc_challenge", "arc_easy", "commonsense_qa", "lambada_openai", "lambada_standard", "wikitext", "gsm8k", "squadv2"],
"multilingual": ["include_base_44", "xcopa", "xnli", "xwinograd", "pawsx", "m_arc", "global_mmlu", "m_hellaswag"]
"show_in_table": [
"mmlu/acc",
"hellaswag/acc"
],
"language_groups": {
"english": [
"en"
],
"swiss": [
"de",
"fr",
"it",
"rm"
],
"eu": [
"sq",
"hy",
"eu",
"be",
"bg",
"ca",
"hr",
"da",
"nl",
"en",
"et",
"fi",
"fr",
"ka",
"de",
"el",
"hu",
"it",
"lt",
"mk",
"pl",
"pt",
"ro",
"rm",
"ru",
"sr",
"sk",
"es",
"sv",
"uk"
]
}
}
11 changes: 10 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
[project]
name = "swissai-evals"
name = "evals"
version = "0.1.0"
description = "Swissai evaluation scripts"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"iso639-lang>=2.6.1",
"pandas>=2.3.0",
"prtpy>=0.8.3",
"pyyaml>=6.0.2",
"requests>=2.32.4",
"wandb>=0.20.1",
]


[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
119 changes: 69 additions & 50 deletions scripts/automate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,13 @@
import collections
import re
import os
import math
import json
import subprocess
import shutil
from pathlib import Path



def unify(completed: list[str]) -> list[str]:
    """Return the names of the task groups whose tasks are all in `completed`.

    Groups are read from `TASKS["groups"]`; the group named `ROOT_EVAL` is
    never reported. The result follows the iteration order of
    `TASKS["groups"]`.
    """
    done = frozenset(completed)
    return [
        group
        for group, members in TASKS["groups"].items()
        if group != ROOT_EVAL and done.issuperset(members)
    ]
from evals.tasks import Task, get_all_tasks, get_partition


def get_running(as_jobname: bool = False) -> dict[str, dict[int, list[str]] | list[str]]:
Expand All @@ -47,10 +40,7 @@ def get_running(as_jobname: bool = False) -> dict[str, dict[int, list[str]] | li
running.append(jobname)
else:
name, group, it = rmatch.groups()
if group == ROOT_EVAL:
running[name][int(it)] += ALL_EVALS
else:
running[name][int(it)].append(group)
running[name][int(it)] += group
return running


Expand All @@ -60,9 +50,9 @@ def get_evaluated(model: str) -> dict[int, list[str]]:
it = int(re.match("^iter_([0-9]+)$", path.parent.parent.parent.parent.name).group(1))
with open(path) as f:
info = json.load(f)
for task in info["results"]:
status[it].append(task)
return {it: unify(tasks) for it, tasks in status.items()}
for taskname in info["results"]:
status[it].append(taskname)
return status


def get_available(model_dirs: list[Path]) -> list[int]:
Expand All @@ -73,49 +63,81 @@ def get_available(model_dirs: list[Path]) -> list[int]:
return available


def submit(name: str, model: dict, it: int, tasks: list[Task]):
    """Submit one `sbatch` evaluation job per shard of `tasks` for iteration `it`.

    The shard count is `ceil(total size of ALL_TASKS / model["max_samples"])`.
    `tasks` is split into that many shards with `get_partition`. A shard that
    equals a shard of the default partition (the partition of `ALL_TASKS`) is
    submitted as `eval_{name}_shard{i}of{n}_{it}`; any other shard is submitted
    as `eval_{name}_mixed_{it}`.

    Args:
        name: Model key; used in the job name and passed to the sbatch script.
        model: Model entry of the automation config. Reads `max_samples`,
            `model_dirs`, `tokens_per_iter` and `size`.
        it: Checkpoint iteration to evaluate.
        tasks: Tasks that still have to be evaluated for this checkpoint.

    Raises:
        ValueError: If not exactly one of `model["model_dirs"]` contains the
            `iter_{it:07d}` checkpoint directory.
    """
    # Get partition of tasks.
    total_size = sum(task.size for task in ALL_TASKS)
    n_shards = math.ceil(total_size / model["max_samples"])
    partition = get_partition(tasks=tasks, shards=n_shards)
    default_partition = get_partition(tasks=ALL_TASKS, shards=n_shards)

    # Locate the checkpoint; fail with an explicit message instead of an opaque
    # tuple-unpacking error when it is missing or present in several directories.
    candidates = [model_dir for model_dir in model["model_dirs"]
                  if Path(f"{model_dir}/iter_{it:07d}").exists()]
    if len(candidates) != 1:
        raise ValueError(
            f"expected exactly one directory in model_dirs containing iter_{it:07d} "
            f"for {name}, found {len(candidates)}"
        )
    path = candidates[0]

    # Environment shared by every job; only TASKS differs between shards.
    base_env = {**os.environ,
                "LOGS_ROOT": CFG["logs_root"],
                "TOKENIZER": "alehc/swissai-tokenizer",
                "BOS": "true",
                "SIZE": str(model["size"]),
                "HF_TEMP_DIR": CFG["hf_temp_dir"]}

    # Schedule all tasks requested.
    for part in partition:
        # Get special jobname depending on the tasks requested: a shard of the
        # default partition keeps its index so that the job name alone tells
        # which tasks are running; anything else is labelled "mixed".
        shard_ids = [i for i, default_part in enumerate(default_partition)
                     if part == default_part]
        if len(shard_ids) == 0:
            label = "mixed"
        else:
            shard_i, = shard_ids
            label = f"shard{shard_i}of{n_shards}"
        jobname = f"eval_{name}_{label}_{it}"

        cmd = ["sbatch", f"--job-name={jobname}", "scripts/evaluate.sbatch", str(path),
               str(it), model["tokens_per_iter"], name]
        env = {**base_env, "TASKS": ",".join(task.name for task in part)}
        print("Launching", jobname)
        result = subprocess.run(cmd, env=env, stdout=subprocess.PIPE)
        # Report a rejected submission rather than dropping it silently; keep
        # going so the remaining shards are still submitted.
        if result.returncode != 0:
            print("Failed to launch", jobname, "- sbatch exited with code", result.returncode)


def submit_needed():
    """Submit evaluation jobs for every due checkpoint that has unhandled tasks.

    For each model in `CFG["models"]`, the tasks considered handled at an
    iteration are those returned by `get_evaluated` plus those covered by jobs
    returned by `get_running`. A checkpoint is due when its iteration is at or
    past `start_eval_from` and a whole number of `frequency` steps away from it.
    A task with aliases counts as handled only when every alias is handled;
    a task without aliases is looked up by its own name.

    Raises:
        ValueError: If a running job group is neither "mixed" nor of the form
            `shard{i}of{n}`.
        AssertionError: If a running shard job was created with a shard count
            different from the one currently derived from the config.
    """
    running = get_running()
    for name, model in CFG["models"].items():
        total_size = sum(task.size for task in ALL_TASKS)
        n_shards = math.ceil(total_size / model["max_samples"])
        default_partition = get_partition(tasks=ALL_TASKS, shards=n_shards)

        # Get tasks already evaluated (reading them from the `results.json`).
        status = get_evaluated(name)

        # Account for running jobs. A "mixed" job does not reveal which tasks it
        # contains, so assume it covers every task; a shard job maps back to the
        # matching shard of the default partition.
        for it, groups in running[name].items():
            for group in groups:
                if group == "mixed":
                    actual_tasks = ALL_TASKS
                else:
                    shard_match = re.match("^shard([0-9]+)of([0-9]+)$", group)
                    if shard_match is None:
                        raise ValueError(
                            f"unrecognized job group {group!r} for {name} at iteration {it}"
                        )
                    # The regex groups are strings; convert before comparing/indexing.
                    shard_i, total_shards = (int(value) for value in shard_match.groups())
                    assert total_shards == n_shards, (
                        f"running job {group!r} was sharded into {total_shards} parts, "
                        f"but the current config yields {n_shards}"
                    )
                    actual_tasks = default_partition[shard_i]

                status.setdefault(it, []).extend(task.name for task in actual_tasks)

        available = get_available(model["model_dirs"])
        for it in available:
            if (it - model["start_eval_from"]) % model["frequency"] == 0 and it >= model["start_eval_from"]:
                # Determine missing set.
                handled = set(status.get(it, []))
                missing = []
                for task in ALL_TASKS:
                    # Results are reported under the aliases when a task has any.
                    reported_names = task.alias if len(task.alias) > 0 else [task.name]
                    if any(actual_name not in handled for actual_name in reported_names):
                        missing.append(task)
                if len(missing) > 0:
                    submit(name, model, it, missing)


def update_hf_checkpoints():
Expand Down Expand Up @@ -172,10 +194,7 @@ def main():


# Script entry point: set the module-level globals and then run `main()`.
if __name__ == "__main__":
# ALL_TASKS and CFG are read as globals by `submit` and `submit_needed`.
ALL_TASKS = get_all_tasks()
with open("configs/automation.json") as f:
CFG = json.load(f)
# NOTE(review): TASKS, ROOT_EVAL and ALL_EVALS below appear to belong to the
# pre-change side of this diff (only the older group-based code reads them) —
# confirm whether they are still needed.
with open("configs/tasks.json") as f:
TASKS = json.load(f)
ROOT_EVAL = TASKS["root"]
ALL_EVALS = sorted([task for task in TASKS["groups"] if task != ROOT_EVAL])
main()
Loading