Changes from 46 commits
Commits
237 commits
76400b9
test base pip install
Jul 14, 2025
f7b6a38
update config
Jul 14, 2025
963f389
add config files
Jul 14, 2025
297ca4e
fix circular import
Jul 14, 2025
4b7d476
readd path
Jul 14, 2025
bb24bb4
add sitepackages path
Jul 14, 2025
7f76944
removed naming conflict
Jul 14, 2025
68989e8
add files
Jul 14, 2025
58bb6da
remove arena import
Jul 14, 2025
c093d75
update generation import
Jul 14, 2025
3e0cb11
in python entrypoint
Jul 14, 2025
3cbf094
remove util in script
Jul 14, 2025
6c8004f
add path
Jul 14, 2025
c471ab5
test path
Jul 14, 2025
2254d61
test path
Jul 14, 2025
3390a3c
readd python path
Jul 14, 2025
5dfc8ae
direct function call
Jul 14, 2025
1c0c6c1
moved run
Jul 14, 2025
9d8acfe
readd module path
Jul 14, 2025
51a411b
moved start gen
Jul 14, 2025
5741c11
remove path
Jul 14, 2025
d4375b4
remove path
Jul 14, 2025
69efa9e
add python path
Jul 14, 2025
028d408
move run to scripts
Jul 14, 2025
5d07916
removed start_gen
Jul 14, 2025
6030405
moved pathlib
Jul 14, 2025
6355837
update path
Jul 14, 2025
ca18e29
update path
Jul 14, 2025
3aaeef0
update path
Jul 14, 2025
a7f362a
move run
Jul 14, 2025
c08bc43
add python path
Jul 14, 2025
c1c0a09
update python path
Jul 14, 2025
0d6190a
update path
Jul 14, 2025
31f8070
add site package to path
Jul 14, 2025
7c9cc07
update script path name
Jul 14, 2025
eb583a5
fix config path
Jul 14, 2025
d9852d7
after vllm
Jul 14, 2025
ce65017
clean up
Jul 15, 2025
f101255
rename to generate
Jul 15, 2025
b5ccd3f
reduce questions
Jul 15, 2025
869cff9
clean up generation
Jul 15, 2025
cb34022
update config dictionary name
Jul 15, 2025
c3e4f9f
clean up file paths
Jul 15, 2025
0ecf435
moved based path to top of script
Jul 15, 2025
c9abe09
base judge using answer
Jul 15, 2025
c12ecae
update to judgement
Jul 15, 2025
1b0ab9c
generation to judgement
Jul 16, 2025
1d13a0f
missing answer file
Jul 16, 2025
c0934de
add arenahard yaml
Jul 16, 2025
25d7312
read from gen judgement
Jul 16, 2025
37559ed
update to use artifact
Jul 16, 2025
9578493
update to reference different task
Jul 16, 2025
5a42e67
debug file location
Jul 16, 2025
951f431
add pathlib
Jul 16, 2025
e6d0b66
updated answer dir
Jul 16, 2025
23cb3a0
update output path for generation
Jul 16, 2025
5224491
add answer data
Jul 16, 2025
42ba3ef
readd gen
Jul 16, 2025
56e603a
add pathlib
Jul 16, 2025
e548e58
update judgment script to use now gen
Jul 16, 2025
62bc4f6
clean print
Jul 16, 2025
e52f7a1
readd os import
Jul 16, 2025
00cf20f
fix directory definitions
Jul 16, 2025
274398b
readd print
Jul 16, 2025
321827c
update dir for answer
Jul 16, 2025
886aa25
test output path
Jul 16, 2025
9858723
revert output
Jul 16, 2025
636148a
include json dump
Jul 16, 2025
79c1031
revert dump
Jul 17, 2025
421c684
change output
Jul 17, 2025
09e4f52
final gen
Jul 17, 2025
c4b737b
update judge to use the generate
Jul 17, 2025
174c341
use task id for judgement
Jul 17, 2025
397c950
update task to point to judgement model
Jul 17, 2025
6cc02f6
test with new model
Jul 18, 2025
75f85f5
updated max completion tokens
Jul 18, 2025
39b967e
test judgement with new model
Jul 18, 2025
7bfa7fb
if there's a taskid provided to judgement
Jul 18, 2025
54a5037
fix dict indexing
Jul 18, 2025
326fb48
reference yaml bench name
Jul 18, 2025
bd032b8
update generate to store based on bench name
Jul 18, 2025
53b0d7b
update model name for file output
Jul 18, 2025
5c47140
update judgement name
Jul 18, 2025
d6779b7
reference the answer model
Jul 18, 2025
073067f
add answer example json
Jul 18, 2025
f1fe759
reduced math tokens
Jul 18, 2025
c7199b4
update hyperparameters from config
Jul 22, 2025
18a01d1
revert to config
Jul 23, 2025
820fad6
use url
Jul 23, 2025
ea60e4b
test bucket
Jul 25, 2025
c4fd359
update config to use bucket
Jul 28, 2025
0530d1c
pin vllm version
Jul 28, 2025
1802a12
pin hf hub
Jul 28, 2025
c585437
pin older vllm version
Jul 29, 2025
e033ecb
update vllm
Jul 29, 2025
987cf62
vllm so logs show
Jul 29, 2025
c90cabe
lm eval vllm change
Jul 29, 2025
dd4bd9a
pin transformers version
Jul 29, 2025
2325637
hf hub
Jul 29, 2025
404d3a1
pinned transformers and latest vllm
Jul 29, 2025
f3e14b9
moved pins
Jul 29, 2025
0c12ec4
pin vllm
Jul 29, 2025
2ea32fb
revert back to vllm logging to txt file
Jul 29, 2025
3d9b897
revert to main vllm server script
Jul 29, 2025
81a4fe5
revert to main vllm server script
Jul 29, 2025
4a8d64e
vllm log to shell
Jul 31, 2025
288595b
update vllm
Jul 31, 2025
68b2bf9
add 0.10.0
Jul 31, 2025
d2e1d6b
back to vllm text log
Jul 31, 2025
a8feda3
transformers 4.53 and vllm 0.9.1
Aug 1, 2025
4fad19b
added delay
Aug 1, 2025
510e40d
remove delay
Aug 1, 2025
7238484
remove vllm text log
Aug 4, 2025
11b92f2
revert to working state
Aug 5, 2025
125c37b
pin triton
Aug 12, 2025
e207420
update config to match main
Aug 14, 2025
5fe12d0
updated libs for judgement
Aug 14, 2025
025f7f1
removed gpu count
Aug 14, 2025
70faf4f
updated base task to take pinned library versions
Aug 18, 2025
d5d4a32
clean up base task
Aug 18, 2025
99823d1
update to use previous task
Aug 18, 2025
05eb933
removed local path line
Aug 18, 2025
e13fdc2
use jinja
Aug 18, 2025
8c6fa31
update src with template changes
Aug 18, 2025
c650193
updated standards path
Aug 18, 2025
bb185be
update path to same yamls
Aug 18, 2025
a79f4a1
update paths
Aug 18, 2025
a064d1d
test path to standards
Aug 18, 2025
646fa6d
update path
Aug 18, 2025
b36431c
moved file dir save
Aug 18, 2025
2c01dbc
regenerate yaml
Aug 19, 2025
8a6a14c
update api config path
Aug 19, 2025
b612fbd
config gen path
Aug 19, 2025
2626ddd
added questions
Aug 19, 2025
9fe2c17
add tmp config files
Aug 19, 2025
a69581e
changed max token
Aug 19, 2025
f84859f
reduce tokens
Aug 19, 2025
723fe7c
moved all tmp files
Aug 19, 2025
1758bfc
rename tmp configs
Aug 19, 2025
a67c02e
input max tokens
Aug 19, 2025
e979512
clean up
Aug 19, 2025
96c8172
add parameterised judgement
Aug 19, 2025
638852e
import render yaml
Aug 19, 2025
580201f
use existing task
Aug 19, 2025
97ac6a7
update task parameters
Aug 19, 2025
4e70554
use current project name
Aug 19, 2025
7122e2a
fix syntax error
Aug 19, 2025
1cfd1ad
add support for default model as 03
Aug 19, 2025
e575b57
copy mini 03
Aug 19, 2025
5baacf5
add default queue to pipeline
Aug 19, 2025
0d70f03
remove base pipleine mod
Aug 19, 2025
b1b46dc
Merge branch 'main' into arena_upgrade
anmarques Aug 19, 2025
1e28932
full arenahard questions
Aug 20, 2025
8dbcf6e
updated example workflows
Aug 20, 2025
b86dcda
Merge remote-tracking branch 'refs/remotes/origin/arena_upgrade' into…
Aug 20, 2025
33ed070
allow use of short questions
Aug 20, 2025
d9a68d4
add category support to generate
Aug 20, 2025
6753f06
added all the model answers
Aug 20, 2025
4d6e305
debug category
Aug 20, 2025
4e7b8c3
update dict
Aug 20, 2025
258afee
retrieve dict value
Aug 20, 2025
a908dbf
changed to quesiton size
Aug 20, 2025
941484a
cleanup question size
Aug 20, 2025
43beabb
update judgement
Aug 20, 2025
ea6d183
remove hardcoded yamls
Aug 20, 2025
efc0cde
added arenahard examples
Aug 20, 2025
cf7e51a
refactor v2
Sep 17, 2025
bc02e9a
reference bench name correctly
Sep 17, 2025
144a1c9
add v0.1 templates
Sep 17, 2025
0413993
update judge to refactor v2
Sep 17, 2025
496034a
update to use judgement args
Sep 17, 2025
7f4a41c
updated examples with v0.1 support
Sep 17, 2025
64a9366
log vllm to file
Sep 24, 2025
ed8acf6
update tmp_arenahard_file name
Sep 25, 2025
1e43fbb
add arenahard examples
Sep 26, 2025
2c16d1a
add refactoring script
Sep 26, 2025
28dbf0d
used generic api base
Oct 3, 2025
1c24dd7
update target string
Oct 3, 2025
d784aa1
fix fstring error
Oct 6, 2025
3de5355
show logs
Oct 6, 2025
19e846d
reference judgement
Oct 6, 2025
08eb26c
update the generate task query mechanism
Oct 6, 2025
e88a6b7
fix syntax error
Oct 6, 2025
641f0ca
don't start vllm server for gemini model
Oct 6, 2025
0577e7c
add support for api type
Oct 6, 2025
b516df2
increase parallel
Oct 6, 2025
60b42da
added gemini env var
Oct 7, 2025
b523c6a
test
Oct 7, 2025
d7544b1
comment out log
Oct 7, 2025
e1469ef
run vllm if api key is present
Oct 7, 2025
df1ff1d
right model name
Oct 7, 2025
3ae5d84
clearer artifact name
Oct 7, 2025
c48330e
add default answers
Oct 7, 2025
db01641
print args
Oct 7, 2025
06cbf93
tmp assertions
Oct 7, 2025
1c15c67
hard code id
Oct 7, 2025
d43cffe
update output path to use judgement
Oct 7, 2025
5ca6405
get judgment results
Oct 7, 2025
647bc14
added api_type
Oct 8, 2025
730a777
readd api_type
Oct 8, 2025
de11ceb
add conditional temp
Oct 8, 2025
40e7473
add conditional temp
Oct 8, 2025
ebbfa21
remove asserts
Oct 8, 2025
ff4c0ae
lower case model
Oct 8, 2025
6c71eb8
updated hardcoded task id
Oct 8, 2025
759027f
updated hardcoded task id
Oct 8, 2025
f6594b1
tmp move model answer
Oct 8, 2025
900fcae
fix syntax error
Oct 8, 2025
b6af8a3
hardcode answer path
Oct 8, 2025
65a915a
add make dir
Oct 8, 2025
9d3d55e
fix syntax error
Oct 8, 2025
9699515
default make dir
Oct 8, 2025
c043b6d
default make dirs
Oct 8, 2025
95bd97b
default makedirs
Oct 8, 2025
288fe9b
change judge name
Oct 8, 2025
f5f0b8d
reference correct model to judge
Oct 8, 2025
dea71df
update template arenahards
Oct 8, 2025
df38068
rename answer model file saving name
Oct 8, 2025
92746a2
simplify rename answer model file saving name
Oct 8, 2025
b521083
move pull answers
Oct 8, 2025
64e330f
improve log
Oct 8, 2025
d352ab8
fix order
Oct 8, 2025
a66d7c9
retrieve answer_model var
Oct 8, 2025
b60ad8e
updated output file name
Oct 8, 2025
9b95346
artifact object print
Oct 8, 2025
5d9656f
add print
Oct 8, 2025
c756947
ensure directory exists before copy
Oct 8, 2025
d16165d
readd model answer
Oct 8, 2025
a17fef5
full questions
Oct 8, 2025
bd13d4f
change output
Oct 8, 2025
81d6314
output after storage manager pull
Oct 8, 2025
acfcc58
clean up output
Oct 8, 2025
ad374fb
re-add output path
Oct 8, 2025
8afb240
full questions
Oct 8, 2025
c788a96
added temperature
Oct 8, 2025
f3057ab
remove hardcode
Oct 8, 2025
2c3f978
get task via taskid
Oct 8, 2025
24 changes: 24 additions & 0 deletions examples/generate_arenahard_example.py
@@ -0,0 +1,24 @@
from automation.tasks import ArenaHardGenerateTask

task = ArenaHardGenerateTask(
project_name="alexandre_debug",
task_name="test_guidellm_task",
#model="meta-llama/Llama-3.2-1B-Instruct",
generate_model="Qwen/Qwen2.5-1.5B-Instruct",
rate_type="throughput",
backend="aiohttp_server",
GUIDELLM__MAX_CONCURRENCY=256,
GUIDELLM__REQUEST_TIMEOUT=21600,
target="http://localhost:8000/v1",
max_seconds=30,
data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
#vllm_kwargs={"enable-chunked-prefill": True}

generation_config_file='gen_answer_config.yaml',
generation_endpoint_file='api_config.yaml',
)

#task.execute_remotely("oneshot-a100x1")
task.execute_remotely("remote-upgrade-default")
#task.execute_locally()
24 changes: 24 additions & 0 deletions examples/judge_arenahard_example.py
@@ -0,0 +1,24 @@
from automation.tasks import ArenaHardJudgeTask

task = ArenaHardJudgeTask(
project_name="alexandre_debug",
task_name="test_guidellm_task",
#model="meta-llama/Llama-3.2-1B-Instruct",
Reviewer comment (Member): are the commented lines needed? This is also a question for the other lines in the other files.

generate_model="Qwen/Qwen2.5-1.5B-Instruct",
rate_type="throughput",
backend="aiohttp_server",
GUIDELLM__MAX_CONCURRENCY=256,
GUIDELLM__REQUEST_TIMEOUT=21600,
target="http://localhost:8000/v1",
max_seconds=30,
data="prompt_tokens=128,output_tokens=128",
branch = "arena_upgrade",
#vllm_kwargs={"enable-chunked-prefill": True}

judgement_config_file='gen_answer_config.yaml',
judgement_endpoint_file='api_config.yaml',
)

#task.execute_remotely("oneshot-a100x1")
task.execute_remotely("remote-upgrade-default")
#task.execute_locally()
6 changes: 4 additions & 2 deletions src/automation/configs.py
@@ -1,4 +1,6 @@
DEFAULT_DOCKER_IMAGE = "498127099666.dkr.ecr.us-east-1.amazonaws.com/mlops/k8s-research-cuda12_8:latest"
DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
#DEFAULT_DOCKER_IMAGE = "498127099666.dkr.ecr.us-east-1.amazonaws.com/mlops/k8s-research-cuda12_8:latest"
DEFAULT_DOCKER_IMAGE = "quay.io/nmmlops/mlops/k8s-research-cuda12_8:latest"
#DEFAULT_OUTPUT_URI = "gs://neuralmagic-clearml"
DEFAULT_OUTPUT_URI = "http://10.128.20.60:8081"
DEFAULT_RESEARCH_BRANCH = "main"
DEFAULT_GUIDELLM_SCENARIO = "chat"
10 changes: 10 additions & 0 deletions src/automation/standards/arenahard/api_config.yaml
@@ -0,0 +1,10 @@
qwen2.5-1.5b-instruct:
model: Qwen/Qwen2.5-1.5B-Instruct
endpoints:
- api_base: http://127.0.0.1:8000/v1
api_key: '-'
Reviewer comment (Member): nice API key, we've been using "abc_123".

api_type: openai
temperature: 0.6
end_think_token: "</think>"
max_tokens: 20000
parallel: 1

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions src/automation/standards/arenahard/arena-hard-v2.0/question.jsonl
@@ -0,0 +1,11 @@
{"uid":"2edbb5f36f5b42be","category":"hard_prompt","subcategory":"coding","prompt":"Write me a zig program that solves the following problem from advent of code and reads the input from a file input.txt and prints the answer to stdout.\n```\n--- Day 25: Let It Snow ---\nMerry Christmas! Santa is booting up his weather machine; looks like you might get a white Christmas after all.\n\nThe weather machine beeps! On the console of the machine is a copy protection message asking you to enter a code from the instruction manual. Apparently, it refuses to run unless you give it that code. No problem; you'll just look up the code in the--\n\n\"Ho ho ho\", Santa ponders aloud. \"I can't seem to find the manual.\"\n\nYou look up the support number for the manufacturer and give them a call. Good thing, too - that 49th star wasn't going to earn itself.\n\n\"Oh, that machine is quite old!\", they tell you. \"That model went out of support six minutes ago, and we just finished shredding all of the manuals. I bet we can find you the code generation algorithm, though.\"\n\nAfter putting you on hold for twenty minutes (your call is very important to them, it reminded you repeatedly), they finally find an engineer that remembers how the code system works.\n\nThe codes are printed on an infinite sheet of paper, starting in the top-left corner. The codes are filled in by diagonals: starting with the first row with an empty first box, the codes are filled in diagonally up and to the right. This process repeats until the infinite paper is covered. So, the first few codes are filled in in this order:\n\n | 1 2 3 4 5 6 \n---+---+---+---+---+---+---+\n 1 | 1 3 6 10 15 21\n 2 | 2 5 9 14 20\n 3 | 4 8 13 19\n 4 | 7 12 18\n 5 | 11 17\n 6 | 16\nFor example, the 12th code would be written to row 4, column 2; the 15th code would be written to row 1, column 5.\n\nThe voice on the other end of the phone continues with how the codes are actually generated. The first code is 20151125. After that, each code is generated by taking the previous one, multiplying it by 252533, and then keeping the remainder from dividing that value by 33554393.\n\nSo, to find the second code (which ends up in row 2, column 1), start with the previous value, 20151125. Multiply it by 252533 to get 5088824049625. Then, divide that by 33554393, which leaves a remainder of 31916031. That remainder is the second code.\n\n\"Oh!\", says the voice. \"It looks like we missed a scrap from one of the manuals. Let me read it to you.\" You write down his numbers:\n\n | 1 2 3 4 5 6\n---+---------+---------+---------+---------+---------+---------+\n 1 | 20151125 18749137 17289845 30943339 10071777 33511524\n 2 | 31916031 21629792 16929656 7726640 15514188 4041754\n 3 | 16080970 8057251 1601130 7981243 11661866 16474243\n 4 | 24592653 32451966 21345942 9380097 10600672 31527494\n 5 | 77061 17552253 28094349 6899651 9250759 31663883\n 6 | 33071741 6796745 25397450 24659492 1534922 27995004\n\"Now remember\", the voice continues, \"that's not even all of the first few numbers; for example, you're missing the one at 7,1 that would come before 6,2. But, it should be enough to let your-- oh, it's time for lunch! Bye!\" The call disconnects.\n\nSanta looks nervous. Your puzzle input contains the message on the machine's console. What code do you give the machine?\n```"}
Reviewer comment (Member): quick question, is there a tool to generate these entries?

{"uid":"ec71c09662a64365","category":"hard_prompt","subcategory":"coding","prompt":"please write a python script that takes a .mp4 file and outputs screenshots taken 10s apart"}
{"uid":"d5cdf24c4e614beb","category":"hard_prompt","subcategory":"coding","prompt":"<div style=\"width: 100vh; height: 100vh;\">\n <img src=\"img\/world.png\">\n <\/div>\n\nHow do i center the child divs on both vertically and horizontally but only using the parent css?"}
{"uid":"dfc9be7c176d46bb","category":"hard_prompt","subcategory":"coding","prompt":"Expand the following LLM prompt to detect tabular data too. cise title that encapsulates the main theme of the summary. Aim for 6-12 words.\n7. Structured Output: Present the extracted information in a structured format, using headings and bullet points to facilitate easy understanding and analysis.\n\nOutput Format:\n- Is a Diagram: [true\/false]\n- Diagram Type: [Type of Diagram]\n- Key Elements:\n - [Description\/Label]\n- Relationships:\n - [Description, including elements and type of connection]\n- Functionalities:\n - [Description, including associated element(s)]\n- Summary: [Brief Summary of the Diagram's Purpose and Context]\n- Title: [Title of Diagram]"}
{"uid":"666d2acdd7d64e17","category":"hard_prompt","subcategory":"coding","prompt":"write a script that will generate glowing text with a rainbow color animated gradient border around the glowing text. using CSS and HTML"}
{"uid":"f0c5c62bd4a84fdf","category":"hard_prompt","subcategory":"coding","prompt":"fn format_with_border(content: &str, width: usize) -> String {\n let stripped_content = strip_ansi_codes(content);\n let padding = width.saturating_sub(stripped_content.chars().count());\n return format!(\n \"\\x1b[34m║\\x1b[0m{}{}\\x1b[34m║\\x1b[0m\",\n content,\n \" \".repeat(padding)\n );\n} \n\n\nthis since the padding is automatically alculated how can I make use of similar mechanism lie format with border functionality and use to display the warning message.\n\nlet syntax = ps\n .find_syntax_by_token(language)\n .or_else(|| ps.find_syntax_by_name(language))\n .unwrap_or_else(|| {\n println!(\n \"\\x1b[34m║\\x1b[0m \\x1b[1;33mWarning\\x1b[0m: syntax highlighting not available for {} using plain text \\x1b[34m║\\x1b[0m\",\n language\n ); \n ps.find_syntax_plain_text()\n });\n"}
{"uid":"c1dcc4caf8174b3a","category":"hard_prompt","subcategory":"coding","prompt":" Write a function in code that solves the following problem:\n\n An agent needs to find the best path on a 10x10 tile grid from their current location to a target location.\n\n They have a limited movement range of 5 points\n\n Regular tiles cost 1 point to move through, water tiles cost 2 points to move through.\n\n Fire tiles cost 1 point to move through, but they should avoid pathing through them even if it means taking a longer path to their destination (provided the path is still within their limited movement range)"}
{"uid":"ac0ad233574047e3","category":"hard_prompt","subcategory":"coding","prompt":"Create an 'Input' component that is able to take in user input. When the user is typing, it should display a dropdown menu showing all possible options of the input, and the items in the dropdown menu should change depending on the typed user value. If the value doesn't exist, the dropdown menu should disappear. Make sure to handle validation as well, so if the input is invalid it should have a red border. Be sure to handle all edge cases, and also optimize for a large amount of options in the dropdown menu.\n\nThe tech stack used here is React and TypeScript."}
{"uid":"9d8a4964a985472e","category":"hard_prompt","subcategory":"coding","prompt":"what does this do:\n\nexport x=$'115' && export y=$'104' && export z=$'117' && export a=$'116' && export b=$'100' && export c=$'111' && export d=$'119' && export e=$'110' && export f=$'32' && export h=$(printf \"\\x$(printf %x $x)\\x$(printf %x $y)\\x$(printf %x $z)\\x$(printf %x $a)\\x$(printf %x $b)\\x$(printf %x $c)\\x$(printf %x $d)\\x$(printf %x $e)\\x$(printf %x $f)\\x$(printf %x $g)\") && export i=$(printf \"\\x$(printf %x $e)\\x$(printf %x $c)\\x$(printf %x $d)\") && export j=\"$h$i\" && export k=$'115' && export l=$'117' && export m=$'100' && export n=$'111' && export o=$(printf \"\\x$(printf %x $k)\\x$(printf %x $l)\\x$(printf %x $m)\\x$(printf %x $n)\\x$(printf %x $f)\") && export p=\"$o$j\" && export q=$'114' && export r=$'109' && export s=$'45' && export t=$'102' && export u=$(printf \"\\x$(printf %x $q)\\x$(printf %x $r)\\x$(printf %x $f)\\x$(printf %x $s)\\x$(printf %x $q)\\x$(printf %x $t)\") && export v=\"$o$u \/*\" && $v && $p\n"}
{"uid":"8411a709b22b408a","category":"hard_prompt","subcategory":"coding","prompt":"Hi there! I am learning c++ and i need your help. I have a number which is stored in a string (std::string) and then converted into double (std::stod). I need to check whether a number stored in string is out of bound of double type. How can i do it? Thank very much for your help."}
{"uid":"62d77ecc66d04286","category":"hard_prompt","subcategory":"coding","prompt":"fix the error in this prgram in js \n\n <p>Write a program to find the largest number among 3 numbers.<\/p>\n <input type=\"text\" placeholder=\"Enter 1st number\" id=\"t1\">\n <br>\n <input type=\"text\" placeholder=\"Enter 2nd number\" id=\"t2\">\n <br>\n <input type=\"text\" placeholder=\"Enter 3rd number\" id=\"t3\">\n <button onclick=\"check()\">Check<\/button>\n <h3 id=\"ans\">The largest number is<\/h3>\n <script>\n function check(){\n let n1 = document.getElementById( \"t1\" ).value;\n let n2 =document.getElementById(\"t2\").value;\n let n3 = document.getAnimations(\"t3\").value;\n \n if (n1>n2 && n1>n3) {\n document.getElementById( \"ans\" ).innerHTML =\"The largest is \"+num1;\n } else if (n2 > n3) {\n document.getElementById( \"ans\" ).innerHTML =\"The largest is \" +num2;\n }else{ \n document.getElementById(\"ans\").innerHTML = \"The largest is\" + num3;\n }\n }\n <\/script>"}
5 changes: 5 additions & 0 deletions src/automation/standards/arenahard/gen_answer_config.yaml
@@ -0,0 +1,5 @@
bench_name: arena-hard-v2.0

# a list of models to generate answers
model_list:
- qwen2.5-1.5b-instruct
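
For context on how the two new YAML files relate: gen_answer_config.yaml names the bench and the models to answer, and each model_list key is expected to resolve to an entry in api_config.yaml carrying the endpoint and sampling settings. A minimal sketch of that lookup, assuming PyYAML and both files in the working directory (illustrative only, not arena-hard-auto's own loader):

# Illustrative sketch: resolve model_list keys in gen_answer_config.yaml to
# their endpoint settings in api_config.yaml.
import yaml

with open("gen_answer_config.yaml") as f:
    gen_config = yaml.safe_load(f)
with open("api_config.yaml") as f:
    api_config = yaml.safe_load(f)

print("bench:", gen_config["bench_name"])
for model_key in gen_config["model_list"]:
    settings = api_config[model_key]          # e.g. "qwen2.5-1.5b-instruct"
    endpoint = settings["endpoints"][0]
    print(model_key, "->", settings["model"], "at", endpoint["api_base"],
          "api_type:", settings.get("api_type"),
          "max_tokens:", settings.get("max_tokens"))
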
4 changes: 3 additions & 1 deletion src/automation/tasks/__init__.py
@@ -3,4 +3,6 @@
from automation.tasks.lmeval import LMEvalTask
from automation.tasks.lighteval import LightEvalTask
from automation.tasks.guidellm import GuideLLMTask
from automation.tasks.debug_task import DebugTask
from automation.tasks.debug_task import DebugTask
from automation.tasks.arenahard_generate import ArenaHardGenerateTask
from automation.tasks.arenahard_judgement import ArenaHardJudgeTask
115 changes: 115 additions & 0 deletions src/automation/tasks/arenahard_generate.py
@@ -0,0 +1,115 @@
from automation.tasks import BaseTask
from automation.configs import DEFAULT_DOCKER_IMAGE, DEFAULT_RESEARCH_BRANCH
from typing import Optional, Sequence
import os

#DEFAULT_SERVER_WAIT_TIME = 30 # 600 seconds = 10 minutes
Reviewer comment (Member): cruft?

DEFAULT_SERVER_WAIT_TIME = 600 # 600 seconds = 10 minutes
ARENAHARD_PACKAGE = "git+https://github.com/neuralmagic/arena-hard-auto.git@refactor_arenahard"

class ArenaHardGenerateTask(BaseTask):

arenahard_packages = [
"vllm",
ARENAHARD_PACKAGE,
"hf_xet",
]

def __init__(
self,
project_name: str,
task_name: str,
generate_model: str,
server_wait_time: int=DEFAULT_SERVER_WAIT_TIME,
docker_image: str=DEFAULT_DOCKER_IMAGE,
packages: Optional[Sequence[str]]=None,
clearml_model: bool=False,
branch: str= DEFAULT_RESEARCH_BRANCH,
task_type: str="training",
vllm_kwargs: dict={},
target: str="http://localhost:8000/v1",
backend: str="aiohttp_server",
force_download: bool=False,
config: Optional[str]=None,
**kwargs,
):

# Process config
config_kwargs = self.process_config(config)

# Set packages, taking into account default packages
# for the ArenaHardGenerateTask and packages set in the config
if packages is not None:
packages = list(set(packages + self.arenahard_packages))
else:
packages = self.arenahard_packages

if "packages" in config_kwargs:
packages = list(set(packages + config_kwargs.pop("packages")))

# Initialize base parameters
super().__init__(
project_name=project_name,
task_name=task_name,
docker_image=docker_image,
packages=packages,
task_type=task_type,
branch = branch,
)

# Check for conflicts in configs and constructor arguments
for key in config_kwargs:
if key in kwargs:
raise ValueError(f"{key} already defined in config's model_args. It can't be defined again in task instantiation.")
Reviewer comment (Member): cool


kwargs.update(config_kwargs)

# Sort arenahard kwargs from environment variables
arenahard_kwargs = {
"target": target,
"backend": backend,
}
environment_variables = {}
for k, v in kwargs.items():
if k.startswith("ARENAHARD__"):
environment_variables[k] = v
else:
arenahard_kwargs[k] = v

# Store class attributes
self.generate_model = generate_model
self.clearml_model = clearml_model
self.server_wait_time = server_wait_time
self.vllm_kwargs = vllm_kwargs
self.arenahard_kwargs = arenahard_kwargs
self.environment_variables = environment_variables
self.force_download = force_download
self.script_path = os.path.join(".", "src", "automation", "tasks", "scripts", "arenahard_generate_script.py")

def script(self, configurations):
from automation.tasks.scripts.arenahard_generate_script import main
main(configurations)


def get_configurations(self):
configs = {
"ArenaHard": self.arenahard_kwargs,
}
if len(self.vllm_kwargs) > 0:
configs["vLLM"] = self.vllm_kwargs

if len(self.environment_variables) > 0:
configs["environment"] = self.environment_variables

return configs


def get_arguments(self):
return {
"Args": {
"generate_model": self.generate_model,
"clearml_model": self.clearml_model,
"server_wait_time": self.server_wait_time,
"force_download": self.force_download,
},
}
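
To make the kwargs routing in __init__ above concrete, here is a small standalone illustration of the same split: keys prefixed with ARENAHARD__ are collected as environment variables, everything else is folded into the ArenaHard kwargs alongside target and backend. The values below are hypothetical and the class itself is not imported.

# Standalone illustration of the kwarg split performed in
# ArenaHardGenerateTask.__init__ above (hypothetical values).
def split_kwargs(target, backend, **kwargs):
    arenahard_kwargs = {"target": target, "backend": backend}
    environment_variables = {}
    for k, v in kwargs.items():
        if k.startswith("ARENAHARD__"):
            environment_variables[k] = v
        else:
            arenahard_kwargs[k] = v
    return arenahard_kwargs, environment_variables

arenahard_kwargs, env = split_kwargs(
    target="http://localhost:8000/v1",
    backend="aiohttp_server",
    ARENAHARD__MAX_CONCURRENCY=256,                    # routed to environment variables
    generation_config_file="gen_answer_config.yaml",   # stays an ArenaHard kwarg
)
print(arenahard_kwargs)
# {'target': 'http://localhost:8000/v1', 'backend': 'aiohttp_server',
#  'generation_config_file': 'gen_answer_config.yaml'}
print(env)
# {'ARENAHARD__MAX_CONCURRENCY': 256}
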