@@ -24,6 +24,7 @@ def evaluate_single_task(model_path, env_server_base, max_rounds, idx):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_path)
     except Exception as e:
+        print(f"Failed to load tokenizer: {e}")
         return None  # Cannot proceed without tokenizer
 
     # Define generation config
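
The added `print` makes the tokenizer guard report why loading failed instead of returning `None` silently. A minimal standalone sketch of the same pattern, assuming only that `transformers` is installed and `model_path` is a local checkpoint directory (the helper name is illustrative):

```python
# Minimal sketch of the defensive tokenizer load above; assumes `transformers`
# is installed and `model_path` points at a local checkpoint directory.
from transformers import AutoTokenizer

def load_tokenizer_or_none(model_path):
    try:
        return AutoTokenizer.from_pretrained(model_path)
    except Exception as e:
        # Report the cause (bad path, missing tokenizer files, ...) so the
        # None return is traceable in the worker logs.
        print(f"Failed to load tokenizer: {e}")
        return None
```
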
@@ -46,6 +47,7 @@ def evaluate_single_task(model_path, env_server_base, max_rounds, idx):
4647 "timeout" : 300 ,
4748 },
4849 n_clients = 1 , # Evaluate one task index at a time
50+
4951 )
5052
5153 # Initialize Evaluator
@@ -59,14 +61,30 @@ def evaluate_single_task(model_path, env_server_base, max_rounds, idx):
     )
 
     # Extract experience data if successful
-    if result and result.experiences:
-        experience = result.experiences[0]
-        # Return entire experience object including conversation, reward, and success
-        return {
-            "conversation": getattr(experience, 'conversation', None),
-            "reward": getattr(experience, 'reward', 0.0),
-            "success": 1 if getattr(experience, 'reward', 0.0) == 1 else 0
-        }
+    # if result and result.experiences:
+    #     experience = result.experiences[0]
+    #     # Return entire experience object including conversation, reward, and success
+    #     return {
+    #         "conversation": getattr(experience, 'conversation', None),
+    #         "reward": getattr(experience, 'reward', 0.0),
+    #         "success": 1 if getattr(experience, 'reward', 0.0) == 1 else 0
+    #     }
+
+    # Replace this section in your evaluate_single_task function
+    if result:
+        # Access raw experience objects directly
+        if hasattr(result, 'experiences') and result.experiences:
+            exp = result.experiences[0]
+            # Print detailed debug information about the experience
+            print(f"Task {idx} - Experience object type: {type(exp)}")
+            print(f"Task {idx} - Available attributes: {dir(exp)}")
+            print(f"Task {idx} - Raw reward value: {getattr(exp, 'reward', None)}")
+
+            return {
+                "conversation": getattr(exp, 'conversation', []),
+                "reward": getattr(exp, 'reward', 0.0),  # Make sure we're accessing the raw reward
+                "success": 1 if getattr(exp, 'reward', 0.0) == 1 else 0
+            }
     else:
         return None
 
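
The rewritten extraction path probes the experience object (type, attributes, raw reward) before trusting it. Factored out, the record-building step might look like the sketch below; `exp` stands in for whatever `result.experiences[0]` yields, and the helper name is illustrative:

```python
# Illustrative helper mirroring the record-building logic in the diff above.
# getattr with defaults keeps it robust when the experience object lacks an
# attribute.
def experience_to_record(exp, idx):
    reward = getattr(exp, 'reward', 0.0)
    # Debug visibility into the raw object, as in the diff above.
    print(f"Task {idx} - Experience object type: {type(exp)}")
    print(f"Task {idx} - Raw reward value: {reward}")
    return {
        "conversation": getattr(exp, 'conversation', []),
        "reward": reward,
        # Success is defined as a perfect reward of 1, matching the diff.
        "success": 1 if reward == 1 else 0,
    }
```
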
@@ -80,13 +98,13 @@ def main():
 
     # --- Argument Parsing ---
     parser = argparse.ArgumentParser(description='Run WebShop evaluation concurrently, initialize evaluator per worker, and save results to JSONL.')
-    parser.add_argument('--model_name', type=str, default='Qwen3-8B', help='Name of the model being evaluated (e.g., AgentLM-7B)')
+    parser.add_argument('--model_name', type=str, default='Qwen3-4B', help='Name of the model being evaluated (e.g., AgentLM-7B)')
     parser.add_argument('--sector', type=str, default='eval', help='Sector or domain of the evaluation (e.g., WebShop)')
     parser.add_argument('--num_tasks', type=int, default=100, help='Number of tasks to process (default: 100)')
     parser.add_argument('--max_workers', type=int, default=20, help='Maximum number of concurrent workers (default: 20)')
-    parser.add_argument('--model_path', type=str, default="/data1/models/Qwen/Qwen3-8B-FP8", help='Path to the model directory')
+    parser.add_argument('--model_path', type=str, default="/data1/models/Qwen/Qwen3-4B-FP8", help='Path to the model directory')
     parser.add_argument('--env_server_base', type=str, default="http://127.0.0.1:36001", help='Base URL for the environment server')
-    parser.add_argument('--max_rounds', type=int, default=7, help='Maximum interaction rounds per task (default: 7)')
+    parser.add_argument('--max_rounds', type=int, default=20, help='Maximum interaction rounds per task (default: 20)')
     parser.add_argument('--output_file', type=str, default="", help='Output file path (default: {model_name}_{sector}.jsonl)')
 
     args = parser.parse_args()
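
Given these arguments, the per-task worker is presumably fanned out across `max_workers` threads, with `n_clients=1` inside each worker so all parallelism comes from the pool. A hedged sketch of that driver loop and the JSONL writing implied by the default output name (`run_all` and its internals are illustrative, not the script's exact `main()`):

```python
# Illustrative driver: fan evaluate_single_task out over a thread pool and
# write one JSON record per completed task. Assumes the argparse namespace
# defined above; the structure is a sketch, not the script's exact code.
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_all(args):
    output_file = args.output_file or f"{args.model_name}_{args.sector}.jsonl"
    with ThreadPoolExecutor(max_workers=args.max_workers) as pool, \
            open(output_file, "w") as f:
        futures = {
            pool.submit(evaluate_single_task, args.model_path,
                        args.env_server_base, args.max_rounds, idx): idx
            for idx in range(args.num_tasks)
        }
        for fut in as_completed(futures):
            record = fut.result()
            if record is not None:  # None means tokenizer load or the task failed
                f.write(json.dumps(record) + "\n")
```
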