@@ -281,6 +281,7 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl
 
         # Extract reward from result
         reward = result.reward if result.reward is not None else 0.0
+        record_metric("reward/julia/reward", reward, Reduce.MEAN)
         obs = result.observation
 
         passed = obs.tests_passed
@@ -293,19 +294,20 @@ async def evaluate_response(self, prompt: str, response: str, target: str) -> fl
         print(f" Tests Passed: {passed}")
         print(f" Tests Failed: {failed}")
         print(f" Total Tests: {total}")
+        print(f" Exit Code: {obs.exit_code}")
+        print(f" Code Compiles: {obs.code_compiles}")
 
         if obs.stderr:
-            print(f" Stderr: {obs.stderr[:200]}")
+            print(f" Stderr: {obs.stderr[:500]}")
             record_metric("reward/julia/has_errors", 1, Reduce.SUM)
 
-        if obs.error_message:
-            print(f" Error Message: {obs.error_message[:200]}")
+        if obs.stdout:
+            print(f" Stdout (first 200 chars): {obs.stdout[:200]}")
 
         # Log metrics
-        record_metric("reward/julia/tests_passed", passed, Reduce.SUM)
-        record_metric("reward/julia/tests_failed", failed, Reduce.SUM)
-        record_metric("reward/julia/tests_total", total, Reduce.SUM)
-        record_metric("reward/julia/pass_rate", reward, Reduce.MEAN)
+        pass_rate = passed / total if total > 0 else 0.0
+
+        record_metric("reward/julia/pass_rate", pass_rate, Reduce.MEAN)
 
         print(f"Final Reward: {reward:.3f}")
         print("=" * 80)
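
One behavioral fix worth calling out in the hunk above: the old code logged the shaped reward under reward/julia/pass_rate, while the new code logs the actual fraction of passing tests and guards the division for the zero-test case (a submission that fails to compile runs no tests). A minimal standalone sketch of that guard; the function name here is illustrative, not from the commit:

    def safe_pass_rate(passed: int, total: int) -> float:
        # Avoid ZeroDivisionError when compilation fails and no tests execute
        return passed / total if total > 0 else 0.0

    assert safe_pass_rate(3, 4) == 0.75
    assert safe_pass_rate(0, 0) == 0.0
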
@@ -337,7 +339,7 @@ def _extract_code(self, response: str) -> str:
 class ComputeAdvantages(ForgeActor):
     @endpoint
     async def compute(self, group: Group) -> list[float]:
-        rewards = torch.tensor([[e.reward for e in group]])
+        rewards = torch.tensor([[e.reward for e in group]], dtype=torch.float32)
         mean = rewards.mean(1, keepdim=True)
         std = rewards.std(1, keepdim=True)
         advantages = (rewards - mean) / (std + 1e-4)
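
The explicit dtype=torch.float32 is presumably a guard against dtype inference: if every reward in a group arrives as a Python int (say, a binary 0/1 reward), torch.tensor infers int64, and .std() rejects integer tensors. A quick self-contained illustration of the failure mode and the fix:

    import torch

    int_rewards = torch.tensor([[1, 0, 1, 1]])  # inferred as int64
    try:
        int_rewards.std(1, keepdim=True)
    except RuntimeError as err:
        print(err)  # std/var only support floating point (and complex) dtypes

    # Pinning the dtype at construction makes the normalization safe
    rewards = torch.tensor([[1, 0, 1, 1]], dtype=torch.float32)
    advantages = (rewards - rewards.mean(1, keepdim=True)) / (
        rewards.std(1, keepdim=True) + 1e-4
    )

Separately, the 1e-4 added to the denominator keeps the advantages finite when all rewards in a group are identical and the std is exactly zero.
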
@@ -517,6 +519,14 @@ async def main(cfg: DictConfig):
     request_timeout_s = openenv_config.get("request_timeout_s", 120.0)
     container_memory_gb = openenv_config.get("container_memory_gb", 4)
 
+    # Set PORT and NUM_WORKER environment variables for the Julia server
+    # These match the Dockerfile defaults
+    if "PORT" not in env_vars:
+        env_vars["PORT"] = str(openenv_config.get("port", 8000))
+    if "NUM_WORKER" not in env_vars:
+        env_vars["NUM_WORKER"] = str(openenv_config.get("num_worker", 4))
+    if "JULIA_MAX_WORKERS" not in env_vars:
+        env_vars["JULIA_MAX_WORKERS"] = str(openenv_config.get("julia_max_workers", 16))
     julia_env_actor = await GenericOpenEnvActor.options(
         **cfg.actors.julia_env
     ).as_actor(
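
The three `if ... not in env_vars` guards give a clear precedence: anything already present in env_vars wins, then config values, then the Dockerfile defaults. An equivalent, more compact formulation using dict.setdefault (a sketch, not the committed code):

    for key, cfg_key, default in [
        ("PORT", "port", 8000),
        ("NUM_WORKER", "num_worker", 4),
        ("JULIA_MAX_WORKERS", "julia_max_workers", 16),
    ]:
        # setdefault only writes the key when it is not already set
        env_vars.setdefault(key, str(openenv_config.get(cfg_key, default)))
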
@@ -587,12 +597,14 @@ async def continuous_rollouts():
             responses: list[Completion] = await policy.generate.route(prompt)
             t.step("policy_generation")
 
-            # Construct episodes and calculate rewards
+            # Construct episodes and calculate rewards in parallel
             episodes = []
             input_ids = torch.ones(
                 (group_size, max_req_tokens + max_res_tokens),
                 dtype=torch.long,
             )
+
+            # Create episodes first
             for i, response in enumerate(responses):
                 episode = Episode(
                     episode_id=str(uuid.uuid4()),
@@ -602,12 +614,20 @@ async def continuous_rollouts():
                     target=target,
                     completion=response,
                 )
-                episode.reward = await reward_actor.evaluate_response.route(
+                episodes.append(episode)
+
+            # Evaluate all rewards in parallel
+            reward_tasks = [
+                reward_actor.evaluate_response.route(
                     prompt=prompt, response=response.text, target=target
                 )
-                episodes.append(episode)
+                for response in responses
+            ]
+            rewards = await asyncio.gather(*reward_tasks)
 
-                # Build input_ids for reference logprobs
+            # Assign rewards and build input_ids
+            for i, (episode, reward) in enumerate(zip(episodes, rewards)):
+                episode.reward = reward
                 input_ids[i, :max_req_tokens] = episode.request_tensor
                 input_ids[i, max_req_tokens:] = episode.response_tensor
 
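
The parallel rewrite is correct only because asyncio.gather returns results in the order its awaitables were submitted, regardless of completion order, so zip(episodes, rewards) pairs each episode with its own score. A self-contained illustration (the delays and values are made up):

    import asyncio

    async def fake_evaluate(idx: int, delay: float) -> float:
        await asyncio.sleep(delay)  # stand-in for a slow Julia test run
        return float(idx)

    async def demo() -> None:
        # The last task finishes first, but gather preserves submission order
        tasks = [fake_evaluate(i, d) for i, d in enumerate([0.3, 0.2, 0.1])]
        rewards = await asyncio.gather(*tasks)
        assert rewards == [0.0, 1.0, 2.0]

    asyncio.run(demo())

With one concurrent evaluation per response, reward scoring drops from group_size sequential round trips to a single round trip bounded by the slowest container.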