237 commits
76400b9
test base pip install
Jul 14, 2025
f7b6a38
update config
Jul 14, 2025
963f389
add config files
Jul 14, 2025
297ca4e
fix circular import
Jul 14, 2025
4b7d476
readd path
Jul 14, 2025
bb24bb4
add sitepackages path
Jul 14, 2025
7f76944
removed naming conflict
Jul 14, 2025
68989e8
add files
Jul 14, 2025
58bb6da
remove arena import
Jul 14, 2025
c093d75
update generation import
Jul 14, 2025
3e0cb11
in python entrypoint
Jul 14, 2025
3cbf094
remove util in script
Jul 14, 2025
6c8004f
add path
Jul 14, 2025
c471ab5
test path
Jul 14, 2025
2254d61
test path
Jul 14, 2025
3390a3c
readd python path
Jul 14, 2025
5dfc8ae
direct function call
Jul 14, 2025
1c0c6c1
moved run
Jul 14, 2025
9d8acfe
readd module path
Jul 14, 2025
51a411b
moved start gen
Jul 14, 2025
5741c11
remove path
Jul 14, 2025
d4375b4
remove path
Jul 14, 2025
69efa9e
add python path
Jul 14, 2025
028d408
move run to scripts
Jul 14, 2025
5d07916
removed start_gen
Jul 14, 2025
6030405
moved pathlib
Jul 14, 2025
6355837
update path
Jul 14, 2025
ca18e29
update path
Jul 14, 2025
3aaeef0
update path
Jul 14, 2025
a7f362a
move run
Jul 14, 2025
c08bc43
add python path
Jul 14, 2025
c1c0a09
update python path
Jul 14, 2025
0d6190a
update path
Jul 14, 2025
31f8070
add site package to path
Jul 14, 2025
7c9cc07
update script path name
Jul 14, 2025
eb583a5
fix config path
Jul 14, 2025
d9852d7
after vllm
Jul 14, 2025
ce65017
clean up
Jul 15, 2025
f101255
rename to generate
Jul 15, 2025
b5ccd3f
reduce questions
Jul 15, 2025
869cff9
clean up generation
Jul 15, 2025
cb34022
update config dictionary name
Jul 15, 2025
c3e4f9f
clean up file paths
Jul 15, 2025
0ecf435
moved based path to top of script
Jul 15, 2025
c9abe09
base judge using answer
Jul 15, 2025
c12ecae
update to judgement
Jul 15, 2025
1b0ab9c
generation to judgement
Jul 16, 2025
1d13a0f
missing answer file
Jul 16, 2025
c0934de
add arenahard yaml
Jul 16, 2025
25d7312
read from gen judgement
Jul 16, 2025
37559ed
update to use artifact
Jul 16, 2025
9578493
update to reference different task
Jul 16, 2025
5a42e67
debug file location
Jul 16, 2025
951f431
add pathlib
Jul 16, 2025
e6d0b66
updated answer dir
Jul 16, 2025
23cb3a0
update output path for generation
Jul 16, 2025
5224491
add answer data
Jul 16, 2025
42ba3ef
readd gen
Jul 16, 2025
56e603a
add pathlib
Jul 16, 2025
e548e58
update judgment script to use now gen
Jul 16, 2025
62bc4f6
clean print
Jul 16, 2025
e52f7a1
readd os import
Jul 16, 2025
00cf20f
fix directory definitions
Jul 16, 2025
274398b
readd print
Jul 16, 2025
321827c
update dir for answer
Jul 16, 2025
886aa25
test output path
Jul 16, 2025
9858723
revert output
Jul 16, 2025
636148a
include json dump
Jul 16, 2025
79c1031
revert dump
Jul 17, 2025
421c684
change output
Jul 17, 2025
09e4f52
final gen
Jul 17, 2025
c4b737b
update judge to use the generate
Jul 17, 2025
174c341
use task id for judgement
Jul 17, 2025
397c950
update task to point to judgement model
Jul 17, 2025
6cc02f6
test with new model
Jul 18, 2025
75f85f5
updated max completion tokens
Jul 18, 2025
39b967e
test judgement with new model
Jul 18, 2025
7bfa7fb
if there's a taskid provided to judgement
Jul 18, 2025
54a5037
fix dict indexing
Jul 18, 2025
326fb48
reference yaml bench name
Jul 18, 2025
bd032b8
update generate to store based on bench name
Jul 18, 2025
53b0d7b
update model name for file output
Jul 18, 2025
5c47140
update judgement name
Jul 18, 2025
d6779b7
reference the answer model
Jul 18, 2025
073067f
add answer example json
Jul 18, 2025
f1fe759
reduced math tokens
Jul 18, 2025
c7199b4
update hyperparameters from config
Jul 22, 2025
18a01d1
revert to config
Jul 23, 2025
820fad6
use url
Jul 23, 2025
ea60e4b
test bucket
Jul 25, 2025
c4fd359
update config to use bucket
Jul 28, 2025
0530d1c
pin vllm version
Jul 28, 2025
1802a12
pin hf hub
Jul 28, 2025
c585437
pin older vllm version
Jul 29, 2025
e033ecb
update vllm
Jul 29, 2025
987cf62
vllm so logs show
Jul 29, 2025
c90cabe
lm eval vllm change
Jul 29, 2025
dd4bd9a
pin transformers version
Jul 29, 2025
2325637
hf hub
Jul 29, 2025
404d3a1
pinned transformers and latest vllm
Jul 29, 2025
f3e14b9
moved pins
Jul 29, 2025
0c12ec4
pin vllm
Jul 29, 2025
2ea32fb
revert back to vllm logging to txt file
Jul 29, 2025
3d9b897
revert to main vllm server script
Jul 29, 2025
81a4fe5
revert to main vllm server script
Jul 29, 2025
4a8d64e
vllm log to shell
Jul 31, 2025
288595b
update vllm
Jul 31, 2025
68b2bf9
add 0.10.0
Jul 31, 2025
d2e1d6b
back to vllm text log
Jul 31, 2025
a8feda3
transformers 4.53 and vllm 0.9.1
Aug 1, 2025
4fad19b
added delay
Aug 1, 2025
510e40d
remove delay
Aug 1, 2025
7238484
remove vllm text log
Aug 4, 2025
11b92f2
revert to working state
Aug 5, 2025
125c37b
pin triton
Aug 12, 2025
e207420
update config to match main
Aug 14, 2025
5fe12d0
updated libs for judgement
Aug 14, 2025
025f7f1
removed gpu count
Aug 14, 2025
70faf4f
updated base task to take pinned library versions
Aug 18, 2025
d5d4a32
clean up base task
Aug 18, 2025
99823d1
update to use previous task
Aug 18, 2025
05eb933
removed local path line
Aug 18, 2025
e13fdc2
use jinja
Aug 18, 2025
8c6fa31
update src with template changes
Aug 18, 2025
c650193
updated standards path
Aug 18, 2025
bb185be
update path to same yamls
Aug 18, 2025
a79f4a1
update paths
Aug 18, 2025
a064d1d
test path to standards
Aug 18, 2025
646fa6d
update path
Aug 18, 2025
b36431c
moved file dir save
Aug 18, 2025
2c01dbc
regenerate yaml
Aug 19, 2025
8a6a14c
update api config path
Aug 19, 2025
b612fbd
config gen path
Aug 19, 2025
2626ddd
added questions
Aug 19, 2025
9fe2c17
add tmp config files
Aug 19, 2025
a69581e
changed max token
Aug 19, 2025
f84859f
reduce tokens
Aug 19, 2025
723fe7c
moved all tmp files
Aug 19, 2025
1758bfc
rename tmp configs
Aug 19, 2025
a67c02e
input max tokens
Aug 19, 2025
e979512
clean up
Aug 19, 2025
96c8172
add parameterised judgement
Aug 19, 2025
638852e
import render yaml
Aug 19, 2025
580201f
use existing task
Aug 19, 2025
97ac6a7
update task parameters
Aug 19, 2025
4e70554
use current project name
Aug 19, 2025
7122e2a
fix syntax error
Aug 19, 2025
1cfd1ad
add support for default model as 03
Aug 19, 2025
e575b57
copy mini 03
Aug 19, 2025
5baacf5
add default queue to pipeline
Aug 19, 2025
0d70f03
remove base pipleine mod
Aug 19, 2025
b1b46dc
Merge branch 'main' into arena_upgrade
anmarques Aug 19, 2025
1e28932
full arenahard questions
Aug 20, 2025
8dbcf6e
updated example workflows
Aug 20, 2025
b86dcda
Merge remote-tracking branch 'refs/remotes/origin/arena_upgrade' into…
Aug 20, 2025
33ed070
allow use of short questions
Aug 20, 2025
d9a68d4
add category support to generate
Aug 20, 2025
6753f06
added all the model answers
Aug 20, 2025
4d6e305
debug category
Aug 20, 2025
4e7b8c3
update dict
Aug 20, 2025
258afee
retrieve dict value
Aug 20, 2025
a908dbf
changed to quesiton size
Aug 20, 2025
941484a
cleanup question size
Aug 20, 2025
43beabb
update judgement
Aug 20, 2025
ea6d183
remove hardcoded yamls
Aug 20, 2025
efc0cde
added arenahard examples
Aug 20, 2025
cf7e51a
refactor v2
Sep 17, 2025
bc02e9a
reference bench name correctly
Sep 17, 2025
144a1c9
add v0.1 templates
Sep 17, 2025
0413993
update judge to refactor v2
Sep 17, 2025
496034a
update to use judgement args
Sep 17, 2025
7f4a41c
updated examples with v0.1 support
Sep 17, 2025
64a9366
log vllm to file
Sep 24, 2025
ed8acf6
update tmp_arenahard_file name
Sep 25, 2025
1e43fbb
add arenahard examples
Sep 26, 2025
2c16d1a
add refactoring script
Sep 26, 2025
28dbf0d
used generic api base
Oct 3, 2025
1c24dd7
update target string
Oct 3, 2025
d784aa1
fix fstring error
Oct 6, 2025
3de5355
show logs
Oct 6, 2025
19e846d
reference judgement
Oct 6, 2025
08eb26c
update the generate task query mechanism
Oct 6, 2025
e88a6b7
fix syntax error
Oct 6, 2025
641f0ca
don't start vllm server for gemini model
Oct 6, 2025
0577e7c
add support for api type
Oct 6, 2025
b516df2
increase parallel
Oct 6, 2025
60b42da
added gemini env var
Oct 7, 2025
b523c6a
test
Oct 7, 2025
d7544b1
comment out log
Oct 7, 2025
e1469ef
run vllm if api key is present
Oct 7, 2025
df1ff1d
right model name
Oct 7, 2025
3ae5d84
clearer artifact name
Oct 7, 2025
c48330e
add default answers
Oct 7, 2025
db01641
print args
Oct 7, 2025
06cbf93
tmp assertions
Oct 7, 2025
1c15c67
hard code id
Oct 7, 2025
d43cffe
update output path to use judgement
Oct 7, 2025
5ca6405
get judgment results
Oct 7, 2025
647bc14
added api_type
Oct 8, 2025
730a777
readd api_type
Oct 8, 2025
de11ceb
add conditional temp
Oct 8, 2025
40e7473
add conditional temp
Oct 8, 2025
ebbfa21
remove asserts
Oct 8, 2025
ff4c0ae
lower case model
Oct 8, 2025
6c71eb8
updated hardcoded task id
Oct 8, 2025
759027f
updated hardcoded task id
Oct 8, 2025
f6594b1
tmp move model answer
Oct 8, 2025
900fcae
fix syntax error
Oct 8, 2025
b6af8a3
hardcode answer path
Oct 8, 2025
65a915a
add make dir
Oct 8, 2025
9d3d55e
fix syntax error
Oct 8, 2025
9699515
default make dir
Oct 8, 2025
c043b6d
default make dirs
Oct 8, 2025
95bd97b
default makedirs
Oct 8, 2025
288fe9b
change judge name
Oct 8, 2025
f5f0b8d
reference correct model to judge
Oct 8, 2025
dea71df
update template arenahards
Oct 8, 2025
df38068
rename answer model file saving name
Oct 8, 2025
92746a2
simplify rename answer model file saving name
Oct 8, 2025
b521083
move pull answers
Oct 8, 2025
64e330f
improve log
Oct 8, 2025
d352ab8
fix order
Oct 8, 2025
a66d7c9
retrieve answer_model var
Oct 8, 2025
b60ad8e
updated output file name
Oct 8, 2025
9b95346
artifact object print
Oct 8, 2025
5d9656f
add print
Oct 8, 2025
c756947
ensure directory exists before copy
Oct 8, 2025
d16165d
readd model answer
Oct 8, 2025
a17fef5
full questions
Oct 8, 2025
bd13d4f
change output
Oct 8, 2025
81d6314
output after storage manager pull
Oct 8, 2025
acfcc58
clean up output
Oct 8, 2025
ad374fb
re-add output path
Oct 8, 2025
8afb240
full questions
Oct 8, 2025
c788a96
added temperature
Oct 8, 2025
f3057ab
remove hardcode
Oct 8, 2025
2c3f978
get task via taskid
Oct 8, 2025
75 changes: 75 additions & 0 deletions examples/arenahard_pipeline.py
@@ -0,0 +1,75 @@
from automation.pipelines import Pipeline
from automation.tasks import ArenaHardGenerateTask, ArenaHardJudgeTask

arena_hard_project_name="dsmall_pipeline_debug"
step1 = ArenaHardGenerateTask(
project_name= arena_hard_project_name,
task_name="generate_pipeline_task1",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1","vllm==0.10.1.1"],
generate_model="meta-llama/Llama-3.2-1B-Instruct",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
server_wait_time = 600,
bench_name = "arena-hard-v2.0",
#bench_name = "arena-hard-v0.1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

step1.create_task()


step2 = ArenaHardJudgeTask(
project_name= arena_hard_project_name,
task_name="judge_pipeline_task1",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1","vllm==0.10.1.1"],
#judgement_model ="Qwen/Qwen2-7B-Instruct",
judgement_model = "openai/gpt-oss-120b",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
server_wait_time = 600,
bench_name = "arena-hard-v2.0",
#bench_name = "arena-hard-v0.1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

step2.create_task()

pipeline = Pipeline(
project_name= arena_hard_project_name,
pipeline_name="pipeline_arenahard",
)


pipeline.add_step(
name="pipeline_arenahard_gen_step_1",
base_task_id = step1.id,
execution_queue="oneshot-a100x1",
#monitor_models=[step1.get_arguments()["Args"]["save_directory"]],
#monitor_artifacts=["recipe"],
)



config_override = {**step1.get_configurations()['ArenaHard'], **{"answer_project_name": arena_hard_project_name}, **{"answer_task_name" : pipeline.steps[0][1]['name'] }}

pipeline.add_step(
name="pipeline_arenahard_judge_step_2",
base_task_id = step2.id,
parents=["pipeline_arenahard_gen_step_1"],
execution_queue="oneshot-a100x4",
#parameter_override={"Args/model_id": "${pipeline_arenahard_generate_step1.models.output.-1.id}"},
configuration_overrides={"ArenaHard" : config_override },
#monitor_metrics=[("gsm8k", "exact_match,strict-match")],
)


pipeline.execute_remotely()
#pipeline.execute_remotely("oneshot-a100x1")
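
The judge step above reuses the generate step's ArenaHard configuration section and overrides only the two answer-lookup keys, so the judge presumably pulls its answers from the pipeline's own generate step. Below is a minimal sketch of the merge semantics behind the config_override expression, using hypothetical values: in Python dict unpacking, later mappings win, so every other setting carried over from step1 stays unchanged.

# Sketch of the config_override merge (hypothetical values, not the task's real config).
base_config = {
    "bench_name": "arena-hard-v2.0",       # kept from the generate step
    "question_size": "small",              # kept from the generate step
    "answer_project_name": "placeholder",  # replaced below
    "answer_task_name": "placeholder",     # replaced below
}

config_override = {
    **base_config,
    "answer_project_name": "dsmall_pipeline_debug",        # assumed project name
    "answer_task_name": "pipeline_arenahard_gen_step_1",   # assumed generate step name
}

assert config_override["answer_task_name"] == "pipeline_arenahard_gen_step_1"
assert config_override["bench_name"] == "arena-hard-v2.0"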
21 changes: 21 additions & 0 deletions examples/generate_arenahard_example.py
@@ -0,0 +1,21 @@
from automation.tasks import ArenaHardGenerateTask

task = ArenaHardGenerateTask(
project_name="gpt_arena_debug",
task_name="generate_math_task_gpt",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
generate_model= "RedHatAI/Qwen2.5-0.5B-Instruct-quantized.w8a8",
#generate_model= "openai/gpt-oss-120b",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
bench_name = "arena-hard-v2.0",
#bench_name = "arena-hard-v0.1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

task.execute_remotely("oneshot-a100x1")
#task.execute_locally()
20 changes: 20 additions & 0 deletions examples/jira_arenahard.py
@@ -0,0 +1,20 @@
from automation.tasks import ArenaHardGenerateTask

task = ArenaHardGenerateTask(
project_name="simple_debug",
task_name="generate_math_task_gpt",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1","vllm==0.10.0"],
generate_model="openai/gpt-oss-120b",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1300,

)

task.execute_remotely("oneshot-a100x4")
#task.execute_locally()
106 changes: 106 additions & 0 deletions examples/jira_arenahard_generation.py
@@ -0,0 +1,106 @@
from automation.tasks import ArenaHardGenerateTask

model_queue_list_dict = [
{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-1.7B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-4B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-4B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-8B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-8B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-14B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-14B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-32B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-32B-quantized.w4a16":"oneshot-a100x2"},

{"Qwen/Qwen3-30B-A3B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-30B-A3B-quantized.w4a16":"oneshot-a100x2"},

{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-3B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w4a16": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "oneshot-a100x4"},

{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16": "oneshot-a100x8"},

{"microsoft/phi-4" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"},

{"mistralai/Mistral-Small-3.1-24B-Instruct-2503" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" :"oneshot-a100x2"},
]




model_queue_list_dict = [
{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},
]

model_queue_list_dict = [
#{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"Qwen/Qwen3-8B":"oneshot-a100x1"},
]

def run_task(model_queue_dict):
model, queue = model_queue_dict.popitem()
project_name="jira_arenahard_generation"

if "Scout" in model:
task = ArenaHardGenerateTask(
project_name=project_name,
task_name= f"generate_task_{model.lower()}",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
generate_model= model,
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True, "max-model-len": 60000},
max_tokens = 16000,
bench_name = "arena-hard-v2.0"
)

else:
task = ArenaHardGenerateTask(
project_name=project_name,
task_name= f"generate_task_{model.lower()}",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
generate_model= model,
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True },
max_tokens = 16000,
bench_name = "arena-hard-v2.0"
)

task.execute_remotely(queue)

for model_queue_dict in model_queue_list_dict:
run_task(model_queue_dict)

#task.execute_locally()
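
Note that jira_arenahard_generation.py reassigns model_queue_list_dict several times; only the last assignment is live, so the version above submits a generate task for Qwen/Qwen3-8B only, and the earlier lists appear to be kept as a menu of entries to copy from. Each entry is a single-item dict mapping a model ID to an execution queue, and run_task unpacks that one pair with popitem(). A minimal sketch of the pattern, with hypothetical entries:

# Sketch of the single-item dict pattern used by run_task (hypothetical entries).
model_queue_list_dict = [
    {"Qwen/Qwen3-8B": "oneshot-a100x1"},
    {"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
]

for model_queue_dict in model_queue_list_dict:
    # popitem() removes and returns the dict's only (model, queue) pair.
    model, queue = model_queue_dict.popitem()
    print(f"would submit generate_task_{model.lower()} to queue {queue}")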
139 changes: 139 additions & 0 deletions examples/jira_arenahard_judging.py
@@ -0,0 +1,139 @@
from automation.tasks import ArenaHardJudgeTask

model_queue_list_dict = [
{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-1.7B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-4B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-4B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-8B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-8B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-14B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-14B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-32B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-32B-quantized.w4a16":"oneshot-a100x2"},

{"Qwen/Qwen3-30B-A3B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-30B-A3B-quantized.w4a16":"oneshot-a100x2"},

{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-3B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w4a16": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "oneshot-a100x4"},

{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16": "oneshot-a100x8"},

{"microsoft/phi-4" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"},

{"mistralai/Mistral-Small-3.1-24B-Instruct-2503" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" :"oneshot-a100x2"},
]






# TODO:
model_queue_list_dict = [
{"RedHatAI/Qwen3-1.7B-quantized.w4a16":"oneshot-a100x1"},
{"Qwen/Qwen3-8B":"oneshot-a100x1"},
{"Qwen/Qwen3-14B":"oneshot-a100x1"},
{"Qwen/Qwen3-32B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-32B-quantized.w4a16":"oneshot-a100x2"},
{"meta-llama/Llama-3.3-70B-Instruct": "oneshot-a100x4"},
{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w4a16": "oneshot-a100x4"},
{"meta-llama/Llama-4-Scout-17B-16E": "oneshot-a100x8"},
{"RedHatAI/Llama-4-Scout-17B-16E-Instruct-quantized.w4a16": "oneshot-a100x8"},
]

model_queue_list_dict = [
#{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
#{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-1.7B":"oneshot-a100x1"},

{"Qwen/Qwen3-4B":"oneshot-a100x1"},
{"RedHatAI/Qwen3-4B-quantized.w4a16":"oneshot-a100x1"},

{"RedHatAI/Qwen3-8B-quantized.w4a16":"oneshot-a100x1"},

{"RedHatAI/Qwen3-14B-quantized.w4a16":"oneshot-a100x1"},

{"Qwen/Qwen3-30B-A3B":"oneshot-a100x2"},
{"RedHatAI/Qwen3-30B-A3B-quantized.w4a16":"oneshot-a100x2"},

{"meta-llama/Llama-3.1-8B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16":"oneshot-a100x1"},
{"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-1B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-1B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"meta-llama/Llama-3.2-3B-Instruct":"oneshot-a100x1"},
{"RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8":"oneshot-a100x1"},

{"RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8": "oneshot-a100x4"},

{"microsoft/phi-4" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w4a16" :"oneshot-a100x1"},
{"RedHatAI/phi-4-quantized.w8a8" :"oneshot-a100x1"},

{"mistralai/Mistral-Small-3.1-24B-Instruct-2503" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8" :"oneshot-a100x2"},
{"RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w4a16" :"oneshot-a100x2"},
]
model_queue_list_dict = [
{"Qwen/Qwen3-0.6B" :"oneshot-a100x1"},
{"RedHatAI/Qwen3-0.6B-quantized.w4a16":"oneshot-a100x1"},
]

judgement_model_dict = {"model": "openai/gpt-oss-120b", "queue": "oneshot-a100x4" }

def run_task(model_queue_dict):
answer_model, _ = model_queue_dict.popitem()
judgement_model = judgement_model_dict["model"]
queue = judgement_model_dict["queue"]

task = ArenaHardJudgeTask(
project_name="jira_arenahard_judging",
task_name = f"judge_{answer_model.lower()}_task",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
answer_project_name = "jira_arenahard_generation",
answer_task_name = f"generate_task_{answer_model.lower()}",
judgement_model = judgement_model,
#question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
bench_name = "arena-hard-v2.0",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 16000,
)

task.execute_remotely(queue)

for model_queue_dict in model_queue_list_dict:
run_task(model_queue_dict)

#task.execute_locally()
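
jira_arenahard_judging.py follows the same reassignment pattern, so only the final two-model list is judged. Each judge task appears to locate its answers by rebuilding the generation task name from the answer model ID, while every judge job runs the shared judgement model (openai/gpt-oss-120b) on its own queue. A minimal sketch of the name pairing, with a hypothetical model ID:

# Sketch: the judge task rebuilds the generation task name from the answer model ID
# (hypothetical model shown; the convention matches jira_arenahard_generation.py).
answer_model = "RedHatAI/Qwen3-0.6B-quantized.w4a16"

answer_task_name = f"generate_task_{answer_model.lower()}"
judge_task_name = f"judge_{answer_model.lower()}_task"

print(answer_task_name)  # generate_task_redhatai/qwen3-0.6b-quantized.w4a16
print(judge_task_name)   # judge_redhatai/qwen3-0.6b-quantized.w4a16_task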
22 changes: 22 additions & 0 deletions examples/judge_arenahard_example.py
@@ -0,0 +1,22 @@
from automation.tasks import ArenaHardJudgeTask

task = ArenaHardJudgeTask(
project_name="gpt_arena_debug",
task_name="test_judge_task_1",
packages = ["huggingface-hub==0.34.3", "triton==3.3.1", "vllm==0.10.1.1"],
answer_task_name ="generate_math_task_gpt",
judgement_model ="Qwen/Qwen2-7B-Instruct",
question_size = "small",
rate_type="throughput",
backend="aiohttp_server",
target="http://localhost:8000/v1",
#bench_name = "arena-hard-v2.0",
bench_name = "arena-hard-v0.1",
#data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
vllm_kwargs={"enable-chunked-prefill": True},
max_tokens = 1024,
)

task.execute_remotely("oneshot-a100x1")
#task.execute_locally()
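
Note that judge_arenahard_example.py resolves answers by task name within the same ClearML project: answer_task_name matches the task_name used in generate_arenahard_example.py, and no answer_project_name is given, so it presumably defaults to the current project (gpt_arena_debug). When pairing the two examples, the bench_name and question_size should presumably also match the values used at generation time; as written, the judge example selects arena-hard-v0.1 while the generate example selects arena-hard-v2.0.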