Changes from 46 commits
Commits
237 commits
76400b9
test base pip install
Jul 14, 2025
f7b6a38
update config
Jul 14, 2025
963f389
add config files
Jul 14, 2025
297ca4e
fix circular import
Jul 14, 2025
4b7d476
readd path
Jul 14, 2025
bb24bb4
add sitepackages path
Jul 14, 2025
7f76944
removed naming conflict
Jul 14, 2025
68989e8
add files
Jul 14, 2025
58bb6da
remove arena import
Jul 14, 2025
c093d75
update generation import
Jul 14, 2025
3e0cb11
in python entrypoint
Jul 14, 2025
3cbf094
remove util in script
Jul 14, 2025
6c8004f
add path
Jul 14, 2025
c471ab5
test path
Jul 14, 2025
2254d61
test path
Jul 14, 2025
3390a3c
readd python path
Jul 14, 2025
5dfc8ae
direct function call
Jul 14, 2025
1c0c6c1
moved run
Jul 14, 2025
9d8acfe
readd module path
Jul 14, 2025
51a411b
moved start gen
Jul 14, 2025
5741c11
remove path
Jul 14, 2025
d4375b4
remove path
Jul 14, 2025
69efa9e
add python path
Jul 14, 2025
028d408
move run to scripts
Jul 14, 2025
5d07916
removed start_gen
Jul 14, 2025
6030405
moved pathlib
Jul 14, 2025
6355837
update path
Jul 14, 2025
ca18e29
update path
Jul 14, 2025
3aaeef0
update path
Jul 14, 2025
a7f362a
move run
Jul 14, 2025
c08bc43
add python path
Jul 14, 2025
c1c0a09
update python path
Jul 14, 2025
0d6190a
update path
Jul 14, 2025
31f8070
add site package to path
Jul 14, 2025
7c9cc07
update script path name
Jul 14, 2025
eb583a5
fix config path
Jul 14, 2025
d9852d7
after vllm
Jul 14, 2025
ce65017
clean up
Jul 15, 2025
f101255
rename to generate
Jul 15, 2025
b5ccd3f
reduce questions
Jul 15, 2025
869cff9
clean up generation
Jul 15, 2025
cb34022
update config dictionary name
Jul 15, 2025
c3e4f9f
clean up file paths
Jul 15, 2025
0ecf435
moved based path to top of script
Jul 15, 2025
c9abe09
base judge using answer
Jul 15, 2025
c12ecae
update to judgement
Jul 15, 2025
1b0ab9c
generation to judgement
Jul 16, 2025
1d13a0f
missing answer file
Jul 16, 2025
c0934de
add arenahard yaml
Jul 16, 2025
25d7312
read from gen judgement
Jul 16, 2025
37559ed
update to use artifact
Jul 16, 2025
9578493
update to reference different task
Jul 16, 2025
5a42e67
debug file location
Jul 16, 2025
951f431
add pathlib
Jul 16, 2025
e6d0b66
updated answer dir
Jul 16, 2025
23cb3a0
update output path for generation
Jul 16, 2025
5224491
add answer data
Jul 16, 2025
42ba3ef
readd gen
Jul 16, 2025
56e603a
add pathlib
Jul 16, 2025
e548e58
update judgment script to use now gen
Jul 16, 2025
62bc4f6
clean print
Jul 16, 2025
e52f7a1
readd os import
Jul 16, 2025
00cf20f
fix directory definitions
Jul 16, 2025
274398b
readd print
Jul 16, 2025
321827c
update dir for answer
Jul 16, 2025
886aa25
test output path
Jul 16, 2025
9858723
revert output
Jul 16, 2025
636148a
include json dump
Jul 16, 2025
79c1031
revert dump
Jul 17, 2025
421c684
change output
Jul 17, 2025
09e4f52
final gen
Jul 17, 2025
c4b737b
update judge to use the generate
Jul 17, 2025
174c341
use task id for judgement
Jul 17, 2025
397c950
update task to point to judgement model
Jul 17, 2025
6cc02f6
test with new model
Jul 18, 2025
75f85f5
updated max completion tokens
Jul 18, 2025
39b967e
test judgement with new model
Jul 18, 2025
7bfa7fb
if there's a taskid provided to judgement
Jul 18, 2025
54a5037
fix dict indexing
Jul 18, 2025
326fb48
reference yaml bench name
Jul 18, 2025
bd032b8
update generate to store based on bench name
Jul 18, 2025
53b0d7b
update model name for file output
Jul 18, 2025
5c47140
update judgement name
Jul 18, 2025
d6779b7
reference the answer model
Jul 18, 2025
073067f
add answer example json
Jul 18, 2025
f1fe759
reduced math tokens
Jul 18, 2025
c7199b4
update hyperparameters from config
Jul 22, 2025
18a01d1
revert to config
Jul 23, 2025
820fad6
use url
Jul 23, 2025
ea60e4b
test bucket
Jul 25, 2025
c4fd359
update config to use bucket
Jul 28, 2025
0530d1c
pin vllm version
Jul 28, 2025
1802a12
pin hf hub
Jul 28, 2025
c585437
pin older vllm version
Jul 29, 2025
e033ecb
update vllm
Jul 29, 2025
987cf62
vllm so logs show
Jul 29, 2025
c90cabe
lm eval vllm change
Jul 29, 2025
dd4bd9a
pin transformers version
Jul 29, 2025
2325637
hf hub
Jul 29, 2025
404d3a1
pinned transformers and latest vllm
Jul 29, 2025
f3e14b9
moved pins
Jul 29, 2025
0c12ec4
pin vllm
Jul 29, 2025
2ea32fb
revert back to vllm logging to txt file
Jul 29, 2025
3d9b897
revert to main vllm server script
Jul 29, 2025
81a4fe5
revert to main vllm server script
Jul 29, 2025
4a8d64e
vllm log to shell
Jul 31, 2025
288595b
update vllm
Jul 31, 2025
68b2bf9
add 0.10.0
Jul 31, 2025
d2e1d6b
back to vllm text log
Jul 31, 2025
a8feda3
transformers 4.53 and vllm 0.9.1
Aug 1, 2025
4fad19b
added delay
Aug 1, 2025
510e40d
remove delay
Aug 1, 2025
7238484
remove vllm text log
Aug 4, 2025
11b92f2
revert to working state
Aug 5, 2025
125c37b
pin triton
Aug 12, 2025
e207420
update config to match main
Aug 14, 2025
5fe12d0
updated libs for judgement
Aug 14, 2025
025f7f1
removed gpu count
Aug 14, 2025
70faf4f
updated base task to take pinned library versions
Aug 18, 2025
d5d4a32
clean up base task
Aug 18, 2025
99823d1
update to use previous task
Aug 18, 2025
05eb933
removed local path line
Aug 18, 2025
e13fdc2
use jinja
Aug 18, 2025
8c6fa31
update src with template changes
Aug 18, 2025
c650193
updated standards path
Aug 18, 2025
bb185be
update path to same yamls
Aug 18, 2025
a79f4a1
update paths
Aug 18, 2025
a064d1d
test path to standards
Aug 18, 2025
646fa6d
update path
Aug 18, 2025
b36431c
moved file dir save
Aug 18, 2025
2c01dbc
regenerate yaml
Aug 19, 2025
8a6a14c
update api config path
Aug 19, 2025
b612fbd
config gen path
Aug 19, 2025
2626ddd
added questions
Aug 19, 2025
9fe2c17
add tmp config files
Aug 19, 2025
a69581e
changed max token
Aug 19, 2025
f84859f
reduce tokens
Aug 19, 2025
723fe7c
moved all tmp files
Aug 19, 2025
1758bfc
rename tmp configs
Aug 19, 2025
a67c02e
input max tokens
Aug 19, 2025
e979512
clean up
Aug 19, 2025
96c8172
add parameterised judgement
Aug 19, 2025
638852e
import render yaml
Aug 19, 2025
580201f
use existing task
Aug 19, 2025
97ac6a7
update task parameters
Aug 19, 2025
4e70554
use current project name
Aug 19, 2025
7122e2a
fix syntax error
Aug 19, 2025
1cfd1ad
add support for default model as 03
Aug 19, 2025
e575b57
copy mini 03
Aug 19, 2025
5baacf5
add default queue to pipeline
Aug 19, 2025
0d70f03
remove base pipleine mod
Aug 19, 2025
b1b46dc
Merge branch 'main' into arena_upgrade
anmarques Aug 19, 2025
1e28932
full arenahard questions
Aug 20, 2025
8dbcf6e
updated example workflows
Aug 20, 2025
b86dcda
Merge remote-tracking branch 'refs/remotes/origin/arena_upgrade' into…
Aug 20, 2025
33ed070
allow use of short questions
Aug 20, 2025
d9a68d4
add category support to generate
Aug 20, 2025
6753f06
added all the model answers
Aug 20, 2025
4d6e305
debug category
Aug 20, 2025
4e7b8c3
update dict
Aug 20, 2025
258afee
retrieve dict value
Aug 20, 2025
a908dbf
changed to quesiton size
Aug 20, 2025
941484a
cleanup question size
Aug 20, 2025
43beabb
update judgement
Aug 20, 2025
ea6d183
remove hardcoded yamls
Aug 20, 2025
efc0cde
added arenahard examples
Aug 20, 2025
cf7e51a
refactor v2
Sep 17, 2025
bc02e9a
reference bench name correctly
Sep 17, 2025
144a1c9
add v0.1 templates
Sep 17, 2025
0413993
update judge to refactor v2
Sep 17, 2025
496034a
update to use judgement args
Sep 17, 2025
7f4a41c
updated examples with v0.1 support
Sep 17, 2025
64a9366
log vllm to file
Sep 24, 2025
ed8acf6
update tmp_arenahard_file name
Sep 25, 2025
1e43fbb
add arenahard examples
Sep 26, 2025
2c16d1a
add refactoring script
Sep 26, 2025
28dbf0d
used generic api base
Oct 3, 2025
1c24dd7
update target string
Oct 3, 2025
d784aa1
fix fstring error
Oct 6, 2025
3de5355
show logs
Oct 6, 2025
19e846d
reference judgement
Oct 6, 2025
08eb26c
update the generate task query mechanism
Oct 6, 2025
e88a6b7
fix syntax error
Oct 6, 2025
641f0ca
don't start vllm server for gemini model
Oct 6, 2025
0577e7c
add support for api type
Oct 6, 2025
b516df2
increase parallel
Oct 6, 2025
60b42da
added gemini env var
Oct 7, 2025
b523c6a
test
Oct 7, 2025
d7544b1
comment out log
Oct 7, 2025
e1469ef
run vllm if api key is present
Oct 7, 2025
df1ff1d
right model name
Oct 7, 2025
3ae5d84
clearer artifact name
Oct 7, 2025
c48330e
add default answers
Oct 7, 2025
db01641
print args
Oct 7, 2025
06cbf93
tmp assertions
Oct 7, 2025
1c15c67
hard code id
Oct 7, 2025
d43cffe
update output path to use judgement
Oct 7, 2025
5ca6405
get judgment results
Oct 7, 2025
647bc14
added api_type
Oct 8, 2025
730a777
readd api_type
Oct 8, 2025
de11ceb
add conditional temp
Oct 8, 2025
40e7473
add conditional temp
Oct 8, 2025
ebbfa21
remove asserts
Oct 8, 2025
ff4c0ae
lower case model
Oct 8, 2025
6c71eb8
updated hardcoded task id
Oct 8, 2025
759027f
updated hardcoded task id
Oct 8, 2025
f6594b1
tmp move model answer
Oct 8, 2025
900fcae
fix syntax error
Oct 8, 2025
b6af8a3
hardcode answer path
Oct 8, 2025
65a915a
add make dir
Oct 8, 2025
9d3d55e
fix syntax error
Oct 8, 2025
9699515
default make dir
Oct 8, 2025
c043b6d
default make dirs
Oct 8, 2025
95bd97b
default makedirs
Oct 8, 2025
288fe9b
change judge name
Oct 8, 2025
f5f0b8d
reference correct model to judge
Oct 8, 2025
dea71df
update template arenahards
Oct 8, 2025
df38068
rename answer model file saving name
Oct 8, 2025
92746a2
simplify rename answer model file saving name
Oct 8, 2025
b521083
move pull answers
Oct 8, 2025
64e330f
improve log
Oct 8, 2025
d352ab8
fix order
Oct 8, 2025
a66d7c9
retrieve answer_model var
Oct 8, 2025
b60ad8e
updated output file name
Oct 8, 2025
9b95346
artifact object print
Oct 8, 2025
5d9656f
add print
Oct 8, 2025
c756947
ensure directory exists before copy
Oct 8, 2025
d16165d
readd model answer
Oct 8, 2025
a17fef5
full questions
Oct 8, 2025
bd13d4f
change output
Oct 8, 2025
81d6314
output after storage manager pull
Oct 8, 2025
acfcc58
clean up output
Oct 8, 2025
ad374fb
re-add output path
Oct 8, 2025
8afb240
full questions
Oct 8, 2025
c788a96
added temperature
Oct 8, 2025
f3057ab
remove hardcode
Oct 8, 2025
2c3f978
get task via taskid
Oct 8, 2025
24 changes: 24 additions & 0 deletions examples/generate_arenahard_example.py
@@ -0,0 +1,24 @@
from automation.tasks import ArenaHardGenerateTask

task = ArenaHardGenerateTask(
project_name="alexandre_debug",
task_name="test_guidellm_task",
#model="meta-llama/Llama-3.2-1B-Instruct",
generate_model="Qwen/Qwen2.5-1.5B-Instruct",
rate_type="throughput",
backend="aiohttp_server",
GUIDELLM__MAX_CONCURRENCY=256,
GUIDELLM__REQUEST_TIMEOUT=21600,
target="http://localhost:8000/v1",
max_seconds=30,
data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
#vllm_kwargs={"enable-chunked-prefill": True}

generation_config_file='gen_answer_config.yaml',
generation_endpoint_file='api_config.yaml',
)

#task.execute_remotely("oneshot-a100x1")
task.execute_remotely("remote-upgrade-default")
#task.execute_locally()
24 changes: 24 additions & 0 deletions examples/judge_arenahard_example.py
@@ -0,0 +1,24 @@
from automation.tasks import ArenaHardJudgeTask

task = ArenaHardJudgeTask(
project_name="alexandre_debug",
task_name="test_guidellm_task",
#model="meta-llama/Llama-3.2-1B-Instruct",
Reviewer comment (Member): are the commented lines needed? This is also a question for the other lines in the other files.

generate_model="Qwen/Qwen2.5-1.5B-Instruct",
rate_type="throughput",
backend="aiohttp_server",
GUIDELLM__MAX_CONCURRENCY=256,
GUIDELLM__REQUEST_TIMEOUT=21600,
target="http://localhost:8000/v1",
max_seconds=30,
data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
#vllm_kwargs={"enable-chunked-prefill": True}

judgement_config_file='gen_answer_config.yaml',
judgement_endpoint_file='api_config.yaml',
)

#task.execute_remotely("oneshot-a100x1")
task.execute_remotely("remote-upgrade-default")
#task.execute_locally()
6 changes: 4 additions & 2 deletions src/automation/configs.py
@@ -1,4 +1,6 @@
DEFAULT_DOCKER_IMAGE = "498127099666.dkr.ecr.us-east-1.amazonaws.com/mlops/k8s-research-cuda12_8:latest"
DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
#DEFAULT_DOCKER_IMAGE = "498127099666.dkr.ecr.us-east-1.amazonaws.com/mlops/k8s-research-cuda12_8:latest"
DEFAULT_DOCKER_IMAGE = "quay.io/nmmlops/mlops/k8s-research-cuda12_8:latest"
#DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
DEFAULT_OUTPUT_URI = "http://10.128.20.60:8081"
DEFAULT_RESEARCH_BRANCH = "main"
DEFAULT_GUIDELLM_SCENARIO = "chat"
10 changes: 10 additions & 0 deletions src/automation/standards/arenahard/api_config.yaml
@@ -0,0 +1,10 @@
qwen2.5-1.5b-instruct:
model: Qwen/Qwen2.5-1.5B-Instruct
endpoints:
- api_base: http://127.0.0.1:8000/v1
api_key: '-'
Reviewer comment (Member): nice API key, we've been using "abc_123".

api_type: openai
temperature: 0.6
end_think_token: "</think>"
max_tokens: 20000
parallel: 1

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions src/automation/standards/arenahard/arena-hard-v2.0/question.jsonl
@@ -0,0 +1,11 @@
{"uid":"2edbb5f36f5b42be","category":"hard_prompt","subcategory":"coding","prompt":"Write me a zig program that solves the following problem from advent of code and reads the input from a file input.txt and prints the answer to stdout.\n```\n--- Day 25: Let It Snow ---\nMerry Christmas! Santa is booting up his weather machine; looks like you might get a white Christmas after all.\n\nThe weather machine beeps! On the console of the machine is a copy protection message asking you to enter a code from the instruction manual. Apparently, it refuses to run unless you give it that code. No problem; you'll just look up the code in the--\n\n\"Ho ho ho\", Santa ponders aloud. \"I can't seem to find the manual.\"\n\nYou look up the support number for the manufacturer and give them a call. Good thing, too - that 49th star wasn't going to earn itself.\n\n\"Oh, that machine is quite old!\", they tell you. \"That model went out of support six minutes ago, and we just finished shredding all of the manuals. I bet we can find you the code generation algorithm, though.\"\n\nAfter putting you on hold for twenty minutes (your call is very important to them, it reminded you repeatedly), they finally find an engineer that remembers how the code system works.\n\nThe codes are printed on an infinite sheet of paper, starting in the top-left corner. The codes are filled in by diagonals: starting with the first row with an empty first box, the codes are filled in diagonally up and to the right. This process repeats until the infinite paper is covered. So, the first few codes are filled in in this order:\n\n | 1 2 3 4 5 6 \n---+---+---+---+---+---+---+\n 1 | 1 3 6 10 15 21\n 2 | 2 5 9 14 20\n 3 | 4 8 13 19\n 4 | 7 12 18\n 5 | 11 17\n 6 | 16\nFor example, the 12th code would be written to row 4, column 2; the 15th code would be written to row 1, column 5.\n\nThe voice on the other end of the phone continues with how the codes are actually generated. The first code is 20151125. After that, each code is generated by taking the previous one, multiplying it by 252533, and then keeping the remainder from dividing that value by 33554393.\n\nSo, to find the second code (which ends up in row 2, column 1), start with the previous value, 20151125. Multiply it by 252533 to get 5088824049625. Then, divide that by 33554393, which leaves a remainder of 31916031. That remainder is the second code.\n\n\"Oh!\", says the voice. \"It looks like we missed a scrap from one of the manuals. Let me read it to you.\" You write down his numbers:\n\n | 1 2 3 4 5 6\n---+---------+---------+---------+---------+---------+---------+\n 1 | 20151125 18749137 17289845 30943339 10071777 33511524\n 2 | 31916031 21629792 16929656 7726640 15514188 4041754\n 3 | 16080970 8057251 1601130 7981243 11661866 16474243\n 4 | 24592653 32451966 21345942 9380097 10600672 31527494\n 5 | 77061 17552253 28094349 6899651 9250759 31663883\n 6 | 33071741 6796745 25397450 24659492 1534922 27995004\n\"Now remember\", the voice continues, \"that's not even all of the first few numbers; for example, you're missing the one at 7,1 that would come before 6,2. But, it should be enough to let your-- oh, it's time for lunch! Bye!\" The call disconnects.\n\nSanta looks nervous. Your puzzle input contains the message on the machine's console. What code do you give the machine?\n```"}
Reviewer comment (Member): quick question, is there a tool to generate these entries?

{"uid":"ec71c09662a64365","category":"hard_prompt","subcategory":"coding","prompt":"please write a python script that takes a .mp4 file and outputs screenshots taken 10s apart"}
{"uid":"d5cdf24c4e614beb","category":"hard_prompt","subcategory":"coding","prompt":"<div style=\"width: 100vh; height: 100vh;\">\n <img src=\"img\/world.png\">\n <\/div>\n\nHow do i center the child divs on both vertically and horizontally but only using the parent css?"}
{"uid":"dfc9be7c176d46bb","category":"hard_prompt","subcategory":"coding","prompt":"Expand the following LLM prompt to detect tabular data too. cise title that encapsulates the main theme of the summary. Aim for 6-12 words.\n7. Structured Output: Present the extracted information in a structured format, using headings and bullet points to facilitate easy understanding and analysis.\n\nOutput Format:\n- Is a Diagram: [true\/false]\n- Diagram Type: [Type of Diagram]\n- Key Elements:\n - [Description\/Label]\n- Relationships:\n - [Description, including elements and type of connection]\n- Functionalities:\n - [Description, including associated element(s)]\n- Summary: [Brief Summary of the Diagram's Purpose and Context]\n- Title: [Title of Diagram]"}
{"uid":"666d2acdd7d64e17","category":"hard_prompt","subcategory":"coding","prompt":"write a script that will generate glowing text with a rainbow color animated gradient border around the glowing text. using CSS and HTML"}
{"uid":"f0c5c62bd4a84fdf","category":"hard_prompt","subcategory":"coding","prompt":"fn format_with_border(content: &str, width: usize) -> String {\n let stripped_content = strip_ansi_codes(content);\n let padding = width.saturating_sub(stripped_content.chars().count());\n return format!(\n \"\\x1b[34m║\\x1b[0m{}{}\\x1b[34m║\\x1b[0m\",\n content,\n \" \".repeat(padding)\n );\n} \n\n\nthis since the padding is automatically alculated how can I make use of similar mechanism lie format with border functionality and use to display the warning message.\n\nlet syntax = ps\n .find_syntax_by_token(language)\n .or_else(|| ps.find_syntax_by_name(language))\n .unwrap_or_else(|| {\n println!(\n \"\\x1b[34m║\\x1b[0m \\x1b[1;33mWarning\\x1b[0m: syntax highlighting not available for {} using plain text \\x1b[34m║\\x1b[0m\",\n language\n ); \n ps.find_syntax_plain_text()\n });\n"}
{"uid":"c1dcc4caf8174b3a","category":"hard_prompt","subcategory":"coding","prompt":" Write a function in code that solves the following problem:\n\n An agent needs to find the best path on a 10x10 tile grid from their current location to a target location.\n\n They have a limited movement range of 5 points\n\n Regular tiles cost 1 point to move through, water tiles cost 2 points to move through.\n\n Fire tiles cost 1 point to move through, but they should avoid pathing through them even if it means taking a longer path to their destination (provided the path is still within their limited movement range)"}
{"uid":"ac0ad233574047e3","category":"hard_prompt","subcategory":"coding","prompt":"Create an 'Input' component that is able to take in user input. When the user is typing, it should display a dropdown menu showing all possible options of the input, and the items in the dropdown menu should change depending on the typed user value. If the value doesn't exist, the dropdown menu should disappear. Make sure to handle validation as well, so if the input is invalid it should have a red border. Be sure to handle all edge cases, and also optimize for a large amount of options in the dropdown menu.\n\nThe tech stack used here is React and TypeScript."}
{"uid":"9d8a4964a985472e","category":"hard_prompt","subcategory":"coding","prompt":"what does this do:\n\nexport x=$'115' && export y=$'104' && export z=$'117' && export a=$'116' && export b=$'100' && export c=$'111' && export d=$'119' && export e=$'110' && export f=$'32' && export h=$(printf \"\\x$(printf %x $x)\\x$(printf %x $y)\\x$(printf %x $z)\\x$(printf %x $a)\\x$(printf %x $b)\\x$(printf %x $c)\\x$(printf %x $d)\\x$(printf %x $e)\\x$(printf %x $f)\\x$(printf %x $g)\") && export i=$(printf \"\\x$(printf %x $e)\\x$(printf %x $c)\\x$(printf %x $d)\") && export j=\"$h$i\" && export k=$'115' && export l=$'117' && export m=$'100' && export n=$'111' && export o=$(printf \"\\x$(printf %x $k)\\x$(printf %x $l)\\x$(printf %x $m)\\x$(printf %x $n)\\x$(printf %x $f)\") && export p=\"$o$j\" && export q=$'114' && export r=$'109' && export s=$'45' && export t=$'102' && export u=$(printf \"\\x$(printf %x $q)\\x$(printf %x $r)\\x$(printf %x $f)\\x$(printf %x $s)\\x$(printf %x $q)\\x$(printf %x $t)\") && export v=\"$o$u \/*\" && $v && $p\n"}
{"uid":"8411a709b22b408a","category":"hard_prompt","subcategory":"coding","prompt":"Hi there! I am learning c++ and i need your help. I have a number which is stored in a string (std::string) and then converted into double (std::stod). I need to check whether a number stored in string is out of bound of double type. How can i do it? Thank very much for your help."}
{"uid":"62d77ecc66d04286","category":"hard_prompt","subcategory":"coding","prompt":"fix the error in this prgram in js \n\n <p>Write a program to find the largest number among 3 numbers.<\/p>\n <input type=\"text\" placeholder=\"Enter 1st number\" id=\"t1\">\n <br>\n <input type=\"text\" placeholder=\"Enter 2nd number\" id=\"t2\">\n <br>\n <input type=\"text\" placeholder=\"Enter 3rd number\" id=\"t3\">\n <button onclick=\"check()\">Check<\/button>\n <h3 id=\"ans\">The largest number is<\/h3>\n <script>\n function check(){\n let n1 = document.getElementById( \"t1\" ).value;\n let n2 =document.getElementById(\"t2\").value;\n let n3 = document.getAnimations(\"t3\").value;\n \n if (n1>n2 && n1>n3) {\n document.getElementById( \"ans\" ).innerHTML =\"The largest is \"+num1;\n } else if (n2 > n3) {\n document.getElementById( \"ans\" ).innerHTML =\"The largest is \" +num2;\n }else{ \n document.getElementById(\"ans\").innerHTML = \"The largest is\" + num3;\n }\n }\n <\/script>"}
5 changes: 5 additions & 0 deletions src/automation/standards/arenahard/gen_answer_config.yaml
@@ -0,0 +1,5 @@
bench_name: arena-hard-v2.0

# a list of models to generate answers
model_list:
- qwen2.5-1.5b-instruct
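
For context on how the two new YAML files relate: gen_answer_config.yaml names the bench and the models to answer, and each model_list key is expected to resolve to an entry in api_config.yaml carrying the endpoint and sampling settings. A minimal sketch of that lookup, assuming PyYAML and both files in the working directory (illustrative only, not arena-hard-auto's own loader):

# Illustrative sketch: resolve model_list keys in gen_answer_config.yaml to
# their endpoint settings in api_config.yaml.
import yaml

with open("gen_answer_config.yaml") as f:
    gen_config = yaml.safe_load(f)
with open("api_config.yaml") as f:
    api_config = yaml.safe_load(f)

print("bench:", gen_config["bench_name"])
for model_key in gen_config["model_list"]:
    settings = api_config[model_key]          # e.g. "qwen2.5-1.5b-instruct"
    endpoint = settings["endpoints"][0]
    print(model_key, "->", settings["model"], "at", endpoint["api_base"],
          "api_type:", settings.get("api_type"),
          "max_tokens:", settings.get("max_tokens"))
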
4 changes: 3 additions & 1 deletion src/automation/tasks/__init__.py
@@ -3,4 +3,6 @@
from automation.tasks.lmeval import LMEvalTask
from automation.tasks.lighteval import LightEvalTask
from automation.tasks.guidellm import GuideLLMTask
from automation.tasks.debug_task import DebugTask
from automation.tasks.debug_task import DebugTask
from automation.tasks.arenahard_generate import ArenaHardGenerateTask
from automation.tasks.arenahard_judgement import ArenaHardJudgeTask
115 changes: 115 additions & 0 deletions src/automation/tasks/arenahard_generate.py
@@ -0,0 +1,115 @@
from automation.tasks import BaseTask
from automation.configs import DEFAULT_DOCKER_IMAGE, DEFAULT_RESEARCH_BRANCH
from typing import Optional, Sequence
import os

#DEFAULT_SERVER_WAIT_TIME = 30 # 600 seconds = 10 minutes
Reviewer comment (Member): cruft?

DEFAULT_SERVER_WAIT_TIME = 600 # 600 seconds = 10 minutes
ARENAHARD_PACKAGE = "git+https://github.com/neuralmagic/arena-hard-auto.git@refactor_arenahard"

class ArenaHardGenerateTask(BaseTask):

arenahard_packages = [
"vllm",
ARENAHARD_PACKAGE,
"hf_xet",
]

def __init__(
self,
project_name: str,
task_name: str,
generate_model: str,
server_wait_time: int=DEFAULT_SERVER_WAIT_TIME,
docker_image: str=DEFAULT_DOCKER_IMAGE,
packages: Optional[Sequence[str]]=None,
clearml_model: bool=False,
branch: str= DEFAULT_RESEARCH_BRANCH,
task_type: str="training",
vllm_kwargs: dict={},
target: str="http://localhost:8000/v1",
backend: str="aiohttp_server",
force_download: bool=False,
config: Optional[str]=None,
**kwargs,
):

# Process config
config_kwargs = self.process_config(config)

# Set packages, taking into account default packages
# for the ArenaHardGenerateTask and packages set in the config
if packages is not None:
packages = list(set(packages + self.arenahard_packages))
else:
packages = self.arenahard_packages

if "packages" in config_kwargs:
packages = list(set(packages + config_kwargs.pop("packages")))

# Initialize base parameters
super().__init__(
project_name=project_name,
task_name=task_name,
docker_image=docker_image,
packages=packages,
task_type=task_type,
branch = branch,
)

# Check for conflicts in configs and constructor arguments
for key in config_kwargs:
if key in kwargs:
raise ValueError(f"{key} already defined in config's model_args. It can't be defined again in task instantiation.")
Reviewer comment (Member): cool


kwargs.update(config_kwargs)

# Sort arenahard kwargs from environment variables
arenahard_kwargs = {
"target": target,
"backend": backend,
}
environment_variables = {}
for k, v in kwargs.items():
if k.startswith("ARENAHARD__"):
environment_variables[k] = v
else:
arenahard_kwargs[k] = v

# Store class attributes
self.generate_model = generate_model
self.clearml_model = clearml_model
self.server_wait_time = server_wait_time
self.vllm_kwargs = vllm_kwargs
self.arenahard_kwargs = arenahard_kwargs
self.environment_variables = environment_variables
self.force_download = force_download
self.script_path = os.path.join(".", "src", "automation", "tasks", "scripts", "arenahard_generate_script.py")

def script(self, configurations):
from automation.tasks.scripts.arenahard_generate_script import main
main(configurations)


def get_configurations(self):
configs = {
"ArenaHard": self.arenahard_kwargs,
}
if len(self.vllm_kwargs) > 0:
configs["vLLM"] = self.vllm_kwargs

if len(self.environment_variables) > 0:
configs["environment"] = self.environment_variables

return configs


def get_arguments(self):
return {
"Args": {
"generate_model": self.generate_model,
"clearml_model": self.clearml_model,
"server_wait_time": self.server_wait_time,
"force_download": self.force_download,
},
}
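
To make the kwargs routing in __init__ above concrete, here is a small standalone illustration of the same split: keys prefixed with ARENAHARD__ are collected as environment variables, everything else is folded into the ArenaHard kwargs alongside target and backend. The values below are hypothetical and the class itself is not imported.

# Standalone illustration of the kwarg split performed in
# ArenaHardGenerateTask.__init__ above (hypothetical values).
def split_kwargs(target, backend, **kwargs):
    arenahard_kwargs = {"target": target, "backend": backend}
    environment_variables = {}
    for k, v in kwargs.items():
        if k.startswith("ARENAHARD__"):
            environment_variables[k] = v
        else:
            arenahard_kwargs[k] = v
    return arenahard_kwargs, environment_variables

arenahard_kwargs, env = split_kwargs(
    target="http://localhost:8000/v1",
    backend="aiohttp_server",
    ARENAHARD__MAX_CONCURRENCY=256,                    # routed to environment variables
    generation_config_file="gen_answer_config.yaml",   # stays an ArenaHard kwarg
)
print(arenahard_kwargs)
# {'target': 'http://localhost:8000/v1', 'backend': 'aiohttp_server',
#  'generation_config_file': 'gen_answer_config.yaml'}
print(env)
# {'ARENAHARD__MAX_CONCURRENCY': 256}
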