237 commits
76400b9
test base pip install
Jul 14, 2025
f7b6a38
update config
Jul 14, 2025
963f389
add config files
Jul 14, 2025
297ca4e
fix circular import
Jul 14, 2025
4b7d476
readd path
Jul 14, 2025
bb24bb4
add sitepackages path
Jul 14, 2025
7f76944
removed naming conflict
Jul 14, 2025
68989e8
add files
Jul 14, 2025
58bb6da
remove arena import
Jul 14, 2025
c093d75
update generation import
Jul 14, 2025
3e0cb11
in python entrypoint
Jul 14, 2025
3cbf094
remove util in script
Jul 14, 2025
6c8004f
add path
Jul 14, 2025
c471ab5
test path
Jul 14, 2025
2254d61
test path
Jul 14, 2025
3390a3c
readd python path
Jul 14, 2025
5dfc8ae
direct function call
Jul 14, 2025
1c0c6c1
moved run
Jul 14, 2025
9d8acfe
readd module path
Jul 14, 2025
51a411b
moved start gen
Jul 14, 2025
5741c11
remove path
Jul 14, 2025
d4375b4
remove path
Jul 14, 2025
69efa9e
add python path
Jul 14, 2025
028d408
move run to scripts
Jul 14, 2025
5d07916
removed start_gen
Jul 14, 2025
6030405
moved pathlib
Jul 14, 2025
6355837
update path
Jul 14, 2025
ca18e29
update path
Jul 14, 2025
3aaeef0
update path
Jul 14, 2025
a7f362a
move run
Jul 14, 2025
c08bc43
add python path
Jul 14, 2025
c1c0a09
update python path
Jul 14, 2025
0d6190a
update path
Jul 14, 2025
31f8070
add site package to path
Jul 14, 2025
7c9cc07
update script path name
Jul 14, 2025
eb583a5
fix config path
Jul 14, 2025
d9852d7
after vllm
Jul 14, 2025
ce65017
clean up
Jul 15, 2025
f101255
rename to generate
Jul 15, 2025
b5ccd3f
reduce questions
Jul 15, 2025
869cff9
clean up generation
Jul 15, 2025
cb34022
update config dictionary name
Jul 15, 2025
c3e4f9f
clean up file paths
Jul 15, 2025
0ecf435
moved based path to top of script
Jul 15, 2025
c9abe09
base judge using answer
Jul 15, 2025
c12ecae
update to judgement
Jul 15, 2025
1b0ab9c
generation to judgement
Jul 16, 2025
1d13a0f
missing answer file
Jul 16, 2025
c0934de
add arenahard yaml
Jul 16, 2025
25d7312
read from gen judgement
Jul 16, 2025
37559ed
update to use artifact
Jul 16, 2025
9578493
update to reference different task
Jul 16, 2025
5a42e67
debug file location
Jul 16, 2025
951f431
add pathlib
Jul 16, 2025
e6d0b66
updated answer dir
Jul 16, 2025
23cb3a0
update output path for generation
Jul 16, 2025
5224491
add answer data
Jul 16, 2025
42ba3ef
readd gen
Jul 16, 2025
56e603a
add pathlib
Jul 16, 2025
e548e58
update judgment script to use now gen
Jul 16, 2025
62bc4f6
clean print
Jul 16, 2025
e52f7a1
readd os import
Jul 16, 2025
00cf20f
fix directory definitions
Jul 16, 2025
274398b
readd print
Jul 16, 2025
321827c
update dir for answer
Jul 16, 2025
886aa25
test output path
Jul 16, 2025
9858723
revert output
Jul 16, 2025
636148a
include json dump
Jul 16, 2025
79c1031
revert dump
Jul 17, 2025
421c684
change output
Jul 17, 2025
09e4f52
final gen
Jul 17, 2025
c4b737b
update judge to use the generate
Jul 17, 2025
174c341
use task id for judgement
Jul 17, 2025
397c950
update task to point to judgement model
Jul 17, 2025
6cc02f6
test with new model
Jul 18, 2025
75f85f5
updated max completion tokens
Jul 18, 2025
39b967e
test judgement with new model
Jul 18, 2025
7bfa7fb
if there's a taskid provided to judgement
Jul 18, 2025
54a5037
fix dict indexing
Jul 18, 2025
326fb48
reference yaml bench name
Jul 18, 2025
bd032b8
update generate to store based on bench name
Jul 18, 2025
53b0d7b
update model name for file output
Jul 18, 2025
5c47140
update judgement name
Jul 18, 2025
d6779b7
reference the answer model
Jul 18, 2025
073067f
add answer example json
Jul 18, 2025
f1fe759
reduced math tokens
Jul 18, 2025
c7199b4
update hyperparameters from config
Jul 22, 2025
18a01d1
revert to config
Jul 23, 2025
820fad6
use url
Jul 23, 2025
ea60e4b
test bucket
Jul 25, 2025
c4fd359
update config to use bucket
Jul 28, 2025
0530d1c
pin vllm version
Jul 28, 2025
1802a12
pin hf hub
Jul 28, 2025
c585437
pin older vllm version
Jul 29, 2025
e033ecb
update vllm
Jul 29, 2025
987cf62
vllm so logs show
Jul 29, 2025
c90cabe
lm eval vllm change
Jul 29, 2025
dd4bd9a
pin transformers version
Jul 29, 2025
2325637
hf hub
Jul 29, 2025
404d3a1
pinned transformers and latest vllm
Jul 29, 2025
f3e14b9
moved pins
Jul 29, 2025
0c12ec4
pin vllm
Jul 29, 2025
2ea32fb
revert back to vllm logging to txt file
Jul 29, 2025
3d9b897
revert to main vllm server script
Jul 29, 2025
81a4fe5
revert to main vllm server script
Jul 29, 2025
4a8d64e
vllm log to shell
Jul 31, 2025
288595b
update vllm
Jul 31, 2025
68b2bf9
add 0.10.0
Jul 31, 2025
d2e1d6b
back to vllm text log
Jul 31, 2025
a8feda3
transformers 4.53 and vllm 0.9.1
Aug 1, 2025
4fad19b
added delay
Aug 1, 2025
510e40d
remove delay
Aug 1, 2025
7238484
remove vllm text log
Aug 4, 2025
11b92f2
revert to working state
Aug 5, 2025
125c37b
pin triton
Aug 12, 2025
e207420
update config to match main
Aug 14, 2025
5fe12d0
updated libs for judgement
Aug 14, 2025
025f7f1
removed gpu count
Aug 14, 2025
70faf4f
updated base task to take pinned library versions
Aug 18, 2025
d5d4a32
clean up base task
Aug 18, 2025
99823d1
update to use previous task
Aug 18, 2025
05eb933
removed local path line
Aug 18, 2025
e13fdc2
use jinja
Aug 18, 2025
8c6fa31
update src with template changes
Aug 18, 2025
c650193
updated standards path
Aug 18, 2025
bb185be
update path to same yamls
Aug 18, 2025
a79f4a1
update paths
Aug 18, 2025
a064d1d
test path to standards
Aug 18, 2025
646fa6d
update path
Aug 18, 2025
b36431c
moved file dir save
Aug 18, 2025
2c01dbc
regenerate yaml
Aug 19, 2025
8a6a14c
update api config path
Aug 19, 2025
b612fbd
config gen path
Aug 19, 2025
2626ddd
added questions
Aug 19, 2025
9fe2c17
add tmp config files
Aug 19, 2025
a69581e
changed max token
Aug 19, 2025
f84859f
reduce tokens
Aug 19, 2025
723fe7c
moved all tmp files
Aug 19, 2025
1758bfc
rename tmp configs
Aug 19, 2025
a67c02e
input max tokens
Aug 19, 2025
e979512
clean up
Aug 19, 2025
96c8172
add parameterised judgement
Aug 19, 2025
638852e
import render yaml
Aug 19, 2025
580201f
use existing task
Aug 19, 2025
97ac6a7
update task parameters
Aug 19, 2025
4e70554
use current project name
Aug 19, 2025
7122e2a
fix syntax error
Aug 19, 2025
1cfd1ad
add support for default model as 03
Aug 19, 2025
e575b57
copy mini 03
Aug 19, 2025
5baacf5
add default queue to pipeline
Aug 19, 2025
0d70f03
remove base pipleine mod
Aug 19, 2025
b1b46dc
Merge branch 'main' into arena_upgrade
anmarques Aug 19, 2025
1e28932
full arenahard questions
Aug 20, 2025
8dbcf6e
updated example workflows
Aug 20, 2025
b86dcda
Merge remote-tracking branch 'refs/remotes/origin/arena_upgrade' into…
Aug 20, 2025
33ed070
allow use of short questions
Aug 20, 2025
d9a68d4
add category support to generate
Aug 20, 2025
6753f06
added all the model answers
Aug 20, 2025
4d6e305
debug category
Aug 20, 2025
4e7b8c3
update dict
Aug 20, 2025
258afee
retrieve dict value
Aug 20, 2025
a908dbf
changed to quesiton size
Aug 20, 2025
941484a
cleanup question size
Aug 20, 2025
43beabb
update judgement
Aug 20, 2025
ea6d183
remove hardcoded yamls
Aug 20, 2025
efc0cde
added arenahard examples
Aug 20, 2025
cf7e51a
refactor v2
Sep 17, 2025
bc02e9a
reference bench name correctly
Sep 17, 2025
144a1c9
add v0.1 templates
Sep 17, 2025
0413993
update judge to refactor v2
Sep 17, 2025
496034a
update to use judgement args
Sep 17, 2025
7f4a41c
updated examples with v0.1 support
Sep 17, 2025
64a9366
log vllm to file
Sep 24, 2025
ed8acf6
update tmp_arenahard_file name
Sep 25, 2025
1e43fbb
add arenahard examples
Sep 26, 2025
2c16d1a
add refactoring script
Sep 26, 2025
28dbf0d
used generic api base
Oct 3, 2025
1c24dd7
update target string
Oct 3, 2025
d784aa1
fix fstring error
Oct 6, 2025
3de5355
show logs
Oct 6, 2025
19e846d
reference judgement
Oct 6, 2025
08eb26c
update the generate task query mechanism
Oct 6, 2025
e88a6b7
fix syntax error
Oct 6, 2025
641f0ca
don't start vllm server for gemini model
Oct 6, 2025
0577e7c
add support for api type
Oct 6, 2025
b516df2
increase parallel
Oct 6, 2025
60b42da
added gemini env var
Oct 7, 2025
b523c6a
test
Oct 7, 2025
d7544b1
comment out log
Oct 7, 2025
e1469ef
run vllm if api key is present
Oct 7, 2025
df1ff1d
right model name
Oct 7, 2025
3ae5d84
clearer artifact name
Oct 7, 2025
c48330e
add default answers
Oct 7, 2025
db01641
print args
Oct 7, 2025
06cbf93
tmp assertions
Oct 7, 2025
1c15c67
hard code id
Oct 7, 2025
d43cffe
update output path to use judgement
Oct 7, 2025
5ca6405
get judgment results
Oct 7, 2025
647bc14
added api_type
Oct 8, 2025
730a777
readd api_type
Oct 8, 2025
de11ceb
add conditional temp
Oct 8, 2025
40e7473
add conditional temp
Oct 8, 2025
ebbfa21
remove asserts
Oct 8, 2025
ff4c0ae
lower case model
Oct 8, 2025
6c71eb8
updated hardcoded task id
Oct 8, 2025
759027f
updated hardcoded task id
Oct 8, 2025
f6594b1
tmp move model answer
Oct 8, 2025
900fcae
fix syntax error
Oct 8, 2025
b6af8a3
hardcode answer path
Oct 8, 2025
65a915a
add make dir
Oct 8, 2025
9d3d55e
fix syntax error
Oct 8, 2025
9699515
default make dir
Oct 8, 2025
c043b6d
default make dirs
Oct 8, 2025
95bd97b
default makedirs
Oct 8, 2025
288fe9b
change judge name
Oct 8, 2025
f5f0b8d
reference correct model to judge
Oct 8, 2025
dea71df
update template arenahards
Oct 8, 2025
df38068
rename answer model file saving name
Oct 8, 2025
92746a2
simplify rename answer model file saving name
Oct 8, 2025
b521083
move pull answers
Oct 8, 2025
64e330f
improve log
Oct 8, 2025
d352ab8
fix order
Oct 8, 2025
a66d7c9
retrieve answer_model var
Oct 8, 2025
b60ad8e
updated output file name
Oct 8, 2025
9b95346
artifact object print
Oct 8, 2025
5d9656f
add print
Oct 8, 2025
c756947
ensure directory exists before copy
Oct 8, 2025
d16165d
readd model answer
Oct 8, 2025
a17fef5
full questions
Oct 8, 2025
bd13d4f
change output
Oct 8, 2025
81d6314
output after storage manager pull
Oct 8, 2025
acfcc58
clean up output
Oct 8, 2025
ad374fb
re-add output path
Oct 8, 2025
8afb240
full questions
Oct 8, 2025
c788a96
added temperature
Oct 8, 2025
f3057ab
remove hardcode
Oct 8, 2025
2c3f978
get task via taskid
Oct 8, 2025
75 changes: 75 additions & 0 deletions examples/arenahard_pipeline.py
@@ -0,0 +1,75 @@
from automation.pipelines import Pipeline
from automation.tasks import ArenaHardGenerateTask, ArenaHardJudgeTask

arena_hard_project_name="dsmall_pipeline_debug"
step1 = ArenaHardGenerateTask(
project_name= arena_hard_project_name,
task_name="generate_pipeline_task1",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1","vllm==0.10.1.1"],
generate_model="meta-llama/Llama-3.2-1B-Instruct",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
server_wait_time = 600,
bench_name = "arena-hard-v2.0",
#bench_name = "arena-hard-v0.1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

step1.create_task()


step2 = ArenaHardJudgeTask(
project_name= arena_hard_project_name,
task_name="judge_pipeline_task1",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1","vllm==0.10.1.1"],
#judgement_model ="Qwen/Qwen2-7B-Instruct",
judgement_model = "openai/gpt-oss-120b",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
server_wait_time = 600,
bench_name = "arena-hard-v2.0",
#bench_name = "arena-hard-v0.1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

step2.create_task()

pipeline = Pipeline(
project_name= arena_hard_project_name,
pipeline_name="pipeline_arenahard",
)


pipeline.add_step(
name="pipeline_arenahard_gen_step_1",
base_task_id = step1.id,
execution_queue="oneshot-a100x1",
#monitor_models=[step1.get_arguments()["Args"]["save_directory"]],
#monitor_artifacts=["recipe"],
)



config_override = {**step1.get_configurations()['ArenaHard'], **{"answer_project_name": arena_hard_project_name}, **{"answer_task_name" : pipeline.steps[0][1]['name'] }}

pipeline.add_step(
name="pipeline_arenahard_judge_step_2",
base_task_id = step2.id,
parents=["pipeline_arenahard_gen_step_1"],
execution_queue="oneshot-a100x4",
#parameter_override={"Args/model_id": "${pipeline_arenahard_generate_step1.models.output.-1.id}"},
configuration_overrides={"ArenaHard" : config_override },
#monitor_metrics=[("gsm8k", "exact_match,strict-match")],
)


pipeline.execute_remotely()
#pipeline.execute_remotely("oneshot-a100x1")
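
The judge step above reuses the generate step's ArenaHard configuration section and overrides only the two answer-lookup keys, so the judge presumably pulls its answers from the pipeline's own generate step. Below is a minimal sketch of the merge semantics behind the config_override expression, using hypothetical values: in Python dict unpacking, later mappings win, so every other setting carried over from step1 stays unchanged.

# Sketch of the config_override merge (hypothetical values, not the task's real config).
base_config = {
    "bench_name": "arena-hard-v2.0",       # kept from the generate step
    "question_size": "small",              # kept from the generate step
    "answer_project_name": "placeholder",  # replaced below
    "answer_task_name": "placeholder",     # replaced below
}

config_override = {
    **base_config,
    "answer_project_name": "dsmall_pipeline_debug",        # assumed project name
    "answer_task_name": "pipeline_arenahard_gen_step_1",   # assumed generate step name
}

assert config_override["answer_task_name"] == "pipeline_arenahard_gen_step_1"
assert config_override["bench_name"] == "arena-hard-v2.0"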
21 changes: 21 additions & 0 deletions examples/generate_arenahard_example.py
@@ -0,0 +1,21 @@
from automation.tasks import ArenaHardGenerateTask

task = ArenaHardGenerateTask(
project_name="gpt_arena_debug",
task_name="generate_math_task_gpt",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
generate_model= "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8",
#generate_model= "openai/gpt-oss-120b",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
bench_name = "arena-hard-v2.0",
#bench_name = "arena-hard-v0.1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

task.execute_remotely("oneshot-a100x1")
#task.execute_locally()
20 changes: 20 additions & 0 deletions examples/jira_arenahard.py
@@ -0,0 +1,20 @@
from automation.tasks import ArenaHardGenerateTask

task = ArenaHardGenerateTask(
project_name="simple_debug",
task_name="generate_math_task_gpt",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1","vllm==0.10.0"],
generate_model="openai/gpt-oss-120b",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1300,

)

task.execute_remotely("oneshot-a100x4")
#task.execute_locally()
106 changes: 106 additions & 0 deletions examples/jira_arenahard_generation.py
@@ -0,0 +1,106 @@
from automation.tasks import ArenaHardGenerateTask

model_queue_list_dict = [
{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-1.7B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-4B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-4B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-8B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-8B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-14B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-14B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-32B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-32B-quantized.w4a16":"oneshot-a100x2"},

{"Qwen/Qwen3-30B-A3B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-30B-A3B-quantized.w4a16":"oneshot-a100x2"},

{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-3B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w4a16": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "oneshot-a100x4"},

{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16": "oneshot-a100x8"},

{"microsoft/phi-4" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"},

{"mistralai/Mistral-Small-3.1-24B-Instruct-2503" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" :"oneshot-a100x2"},
]




model_queue_list_dict = [
{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},
]

model_queue_list_dict = [
#{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"Qwen/Qwen3-8B":"oneshot-a100x1"},
]

def run_task(model_queue_dict):
model, queue = model_queue_dict.popitem()
project_name="jira_arenahard_generation"

if "Scout" in model:
task = ArenaHardGenerateTask(
project_name=project_name,
task_name= f"generate_task_{model.lower()}",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
generate_model= model,
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True, "max-model-len": 60000},
max_tokens = 16000,
bench_name = "arena-hard-v2.0"
)

else:
task = ArenaHardGenerateTask(
project_name=project_name,
task_name= f"generate_task_{model.lower()}",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
generate_model= model,
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True },
max_tokens = 16000,
bench_name = "arena-hard-v2.0"
)

task.execute_remotely(queue)

for model_queue_dict in model_queue_list_dict:
run_task(model_queue_dict)

#task.execute_locally()
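
Note that jira_arenahard_generation.py reassigns model_queue_list_dict several times; only the last assignment is live, so the version above submits a generate task for Qwen/Qwen3-8B only, and the earlier lists appear to be kept as a menu of entries to copy from. Each entry is a single-item dict mapping a model ID to an execution queue, and run_task unpacks that one pair with popitem(). A minimal sketch of the pattern, with hypothetical entries:

# Sketch of the single-item dict pattern used by run_task (hypothetical entries).
model_queue_list_dict = [
    {"Qwen/Qwen3-8B": "oneshot-a100x1"},
    {"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
]

for model_queue_dict in model_queue_list_dict:
    # popitem() removes and returns the dict's only (model, queue) pair.
    model, queue = model_queue_dict.popitem()
    print(f"would submit generate_task_{model.lower()} to queue {queue}")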
139 changes: 139 additions & 0 deletions examples/jira_arenahard_judging.py
@@ -0,0 +1,139 @@
from automation.tasks import ArenaHardJudgeTask

model_queue_list_dict = [
{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-1.7B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-4B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-4B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-8B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-8B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-14B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-14B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-32B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-32B-quantized.w4a16":"oneshot-a100x2"},

{"Qwen/Qwen3-30B-A3B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-30B-A3B-quantized.w4a16":"oneshot-a100x2"},

{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-3B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w4a16": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "oneshot-a100x4"},

{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16": "oneshot-a100x8"},

{"microsoft/phi-4" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"},

{"mistralai/Mistral-Small-3.1-24B-Instruct-2503" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" :"oneshot-a100x2"},
]






# TODO:
model_queue_list_dict = [
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},
{"Qwen/Qwen3-8B":"oneshot-a100x1"},
{"Qwen/Qwen3-14B":"oneshot-a100x1"},
{"Qwen/Qwen3-32B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-32B-quantized.w4a16":"oneshot-a100x2"},
{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w4a16": "oneshot-a100x4"},
{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16": "oneshot-a100x8"},
]

model_queue_list_dict = [
#{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
#{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-1.7B":"oneshot-a100x1"},

{"Qwen/Qwen3-4B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-4B-quantized.w4a16":"oneshot-a100x1"},

{"RedHatAI/Qwen3-8B-quantized.w4a16":"oneshot-a100x1"},

{"RedHatAI/Qwen3-14B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-30B-A3B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-30B-A3B-quantized.w4a16":"oneshot-a100x2"},

{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-3B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "oneshot-a100x4"},

{"microsoft/phi-4" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"},

{"mistralai/Mistral-Small-3.1-24B-Instruct-2503" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" :"oneshot-a100x2"},
]
model_queue_list_dict = [
{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},
]

judgement_model_dict = {"model": "openai/gpt-oss-120b", "queue": "oneshot-a100x4" }

def run_task(model_queue_dict):
answer_model, _ = model_queue_dict.popitem()
judgement_model = judgement_model_dict["model"]
queue = judgement_model_dict["queue"]

task = ArenaHardJudgeTask(
project_name="jira_arenahard_judging",
task_name = f"judge_{answer_model.lower()}_task",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
answer_project_name = "jira_arenahard_generation",
answer_task_name = f"generate_task_{answer_model.lower()}",
judgement_model = judgement_model,
#question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
bench_name = "arena-hard-v2.0",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 16000,
)

task.execute_remotely(queue)

for model_queue_dict in model_queue_list_dict:
run_task(model_queue_dict)

#task.execute_locally()
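
jira_arenahard_judging.py follows the same reassignment pattern, so only the final two-model list is judged. Each judge task appears to locate its answers by rebuilding the generation task name from the answer model ID, while every judge job runs the shared judgement model (openai/gpt-oss-120b) on its own queue. A minimal sketch of the name pairing, with a hypothetical model ID:

# Sketch: the judge task rebuilds the generation task name from the answer model ID
# (hypothetical model shown; the convention matches jira_arenahard_generation.py).
answer_model = "RedHatAI/Qwen3-0.6B-quantized.w4a16"

answer_task_name = f"generate_task_{answer_model.lower()}"
judge_task_name = f"judge_{answer_model.lower()}_task"

print(answer_task_name)  # generate_task_redhatai/qwen3-0.6b-quantized.w4a16
print(judge_task_name)   # judge_redhatai/qwen3-0.6b-quantized.w4a16_task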
22 changes: 22 additions & 0 deletions examples/judge_arenahard_example.py
@@ -0,0 +1,22 @@
from automation.tasks import ArenaHardJudgeTask

task = ArenaHardJudgeTask(
project_name="gpt_arena_debug",
task_name="test_judge_task_1",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
answer_task_name ="generate_math_task_gpt",
judgement_model ="Qwen/Qwen2-7B-Instruct",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
#bench_name = "arena-hard-v2.0",
bench_name = "arena-hard-v0.1",
#data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

task.execute_remotely("oneshot-a100x1")
#task.execute_locally()
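
Note that judge_arenahard_example.py resolves answers by task name within the same ClearML project: answer_task_name matches the task_name used in generate_arenahard_example.py, and no answer_project_name is given, so it presumably defaults to the current project (gpt_arena_debug). When pairing the two examples, the bench_name and question_size should presumably also match the values used at generation time; as written, the judge example selects arena-hard-v0.1 while the generate example selects arena-hard-v2.0.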