 import asyncio
-import json
 import tempfile
 import uuid
 from collections import defaultdict
 
 from lmms_eval.entrypoints.server_args import ServerArgs
 
+# =============================================================================
+# Security Warning
+# =============================================================================
+# WARNING: This server is intended for use in trusted environments only.
+# Do NOT expose this server to untrusted networks without additional security
+# layers such as authentication, rate limiting, and network isolation.
+# =============================================================================
+
+
 # =============================================================================
 # Enums and Models
 # =============================================================================
@@ -167,17 +175,10 @@ async def run_evaluation_subprocess(config: dict) -> dict:
     This allows GPU-based evaluation to run in a separate process
     while the server remains responsive.
     """
-    # Create temporary files for config and output
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
-        json.dump(config, f)
-        config_path = f.name
-
-    output_path = config.get("output_dir", tempfile.mktemp(suffix="_results"))
+    output_path = config.get("output_dir") or tempfile.mkdtemp(prefix="lmms_eval_")
 
     # Build command
     num_gpus = config.get("num_gpus", 1)
-
-    # Use accelerate launch for multi-GPU support
     cmd = [
         "accelerate",
         "launch",
@@ -198,7 +199,7 @@ async def run_evaluation_subprocess(config: dict) -> dict:
         if isinstance(config["model_args"], dict):
             model_args_str = ",".join(f"{k}={v}" for k, v in config["model_args"].items())
         else:
-            model_args_str = config["model_args"]
+            model_args_str = str(config["model_args"])
         cmd.extend(["--model_args", model_args_str])
 
     if config.get("batch_size"):
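
For reference, a small sketch of the `model_args` handling shown above: a dict is flattened into the comma-separated `key=value` string the CLI expects, and anything else passes through `str()`. The argument names below are made up for illustration.

```python
# Hedged sketch of the model_args flattening; the keys are illustrative only.
def flatten_model_args(model_args) -> str:
    if isinstance(model_args, dict):
        return ",".join(f"{k}={v}" for k, v in model_args.items())
    return str(model_args)  # already a string, or anything else str()-able

print(flatten_model_args({"pretrained": "llava-1.5-7b", "device_map": "auto"}))
# pretrained=llava-1.5-7b,device_map=auto
print(flatten_model_args("pretrained=llava-1.5-7b"))
# pretrained=llava-1.5-7b
```
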
@@ -211,7 +212,7 @@ async def run_evaluation_subprocess(config: dict) -> dict:
         cmd.extend(["--num_fewshot", str(config["num_fewshot"])])
 
     if config.get("gen_kwargs"):
-        cmd.extend(["--gen_kwargs", config["gen_kwargs"]])
+        cmd.extend(["--gen_kwargs", str(config["gen_kwargs"])])
 
     if config.get("log_samples"):
         cmd.append("--log_samples")
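
The same `str()` coercion now wraps `gen_kwargs` as well: every argv element handed to `asyncio.create_subprocess_exec` must already be a string (a dict or int entry typically raises a `TypeError`), so non-string config values are stringified before joining `cmd`. A tiny illustration with made-up values:

```python
# Illustrative only: why non-string config values are wrapped in str()
# before being appended to the argv list.
gen_kwargs = {"max_new_tokens": 256, "temperature": 0.0}
num_fewshot = 0

cmd = ["--gen_kwargs", str(gen_kwargs), "--num_fewshot", str(num_fewshot)]
print(cmd)
# ['--gen_kwargs', "{'max_new_tokens': 256, 'temperature': 0.0}", '--num_fewshot', '0']
```
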
@@ -225,10 +226,9 @@ async def run_evaluation_subprocess(config: dict) -> dict:
     proc = await asyncio.create_subprocess_exec(
         *cmd,
         stdout=asyncio.subprocess.PIPE,
-        stderr=asyncio.subprocess.STDOUT,  # Combine stderr into stdout
+        stderr=asyncio.subprocess.STDOUT,
     )
 
-    # Stream output in real-time
     while True:
         line = await proc.stdout.readline()
         if not line:
@@ -237,13 +237,9 @@ async def run_evaluation_subprocess(config: dict) -> dict:
 
     await proc.wait()
 
-    # Clean up config file
-    Path(config_path).unlink(missing_ok=True)
-
     if proc.returncode != 0:
         raise RuntimeError(f"Evaluation failed with return code {proc.returncode}")
 
-    # Parse and return results from output directory
     return parse_output_directory(output_path)
 
 
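
Taken together, the last two hunks leave the streaming loop looking roughly like the self-contained sketch below: stderr is merged into stdout, output is consumed line by line so the event loop stays responsive, and a non-zero return code becomes a `RuntimeError`. Here `echo` stands in for the real `accelerate launch ...` invocation.

```python
# Simplified, self-contained version of the streaming pattern in the diff.
# The echo command is a placeholder for the real `accelerate launch ...` call.
import asyncio

async def stream_command(cmd: list[str]) -> None:
    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT,  # interleave stderr with stdout
    )
    while True:
        line = await proc.stdout.readline()
        if not line:  # EOF once the process closes its output
            break
        print(line.decode(errors="replace").rstrip())
    await proc.wait()
    if proc.returncode != 0:
        raise RuntimeError(f"Evaluation failed with return code {proc.returncode}")

asyncio.run(stream_command(["echo", "evaluation finished"]))
```
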