Skip to content

Commit 70ea32e

Browse files
committed
fix: agentenv-webshop evaluator success rate 0 error
1 parent 372c62f commit 70ea32e

File tree

3 files changed

+42
-16
lines changed

3 files changed

+42
-16
lines changed

openmanus_rl/agentgym/agentenv/agentenv/controller/task.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(
3535
self,
3636
client_args: Mapping[str, Any],
3737
tokenizer: PreTrainedTokenizerBase, # Tokenizer is now needed at init for output processing
38-
vllm_base_url: str = "http://localhost:8001/v1",
38+
vllm_base_url: str = "http://localhost:8002/v1",
3939
vllm_api_key: str = "dummy-key",
4040
vllm_model_name: str = "agent-llm",
4141
) -> None:
@@ -254,15 +254,17 @@ def _generate_experience_one(
254254
if len(conversation) > 50: # Safety break for very long conversations
255255
print("Warning: Max conversation turns reached.")
256256
break
257+
# full_text, full_input_ids, full_action_mask = self._reconstruct_tokenized_info(conversation)
257258

258259
return ExperienceOutput(
259260
conversation=conversation,
260-
reward=0,
261+
reward=reward,
261262
text=None,
262263
seq_ids=None,
263264
attention_mask=None,
264265
action_mask=None,
265266
)
267+
266268

267269
def _generate_experience_batch(
268270
self,

openmanus_rl/evaluation/run_vllm.sh

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,21 @@ else
1616
echo "Configured to use 4 GPUs: CUDA_VISIBLE_DEVICES=$visible_devices, tensor_parallel_size=$tensor_parallel_size"
1717
fi
1818

19+
MODEL_PATH="/data1/models/ZhipuAI/GLM-4-32B-0414"
20+
# MODEL_PATH="/data1/models/Qwen/Qwen3-4B-FP8"
21+
# MODEL_PATH="/data1/models/Qwen/Qwen3-235B-A22B-FP8"
22+
# MODEL_PATH="THUDM/agentlm-7b"
23+
1924
# --- VLLM Command ---
2025
# Set environment variables and run the vllm server
2126
CUDA_VISIBLE_DEVICES="$visible_devices" \
2227
VLLM_USE_V1=0 \
23-
vllm serve /data1/models/Qwen/Qwen3-8B-FP8 \
28+
vllm serve "$MODEL_PATH" \
2429
--gpu-memory-utilization 0.95 \
2530
--tensor-parallel-size "$tensor_parallel_size" \
2631
--host 0.0.0.0 \
27-
--port 8001 \
32+
--port 8002 \
2833
--max-model-len 32768 \
29-
--served-model-name agent-llm
34+
--served-model-name agent-llm \
35+
# --enable-expert-parallel
3036
# --rope-scaling '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' \

openmanus_rl/evaluation/vllm_eval_webshop.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def evaluate_single_task(model_path, env_server_base, max_rounds, idx):
2424
try:
2525
tokenizer = AutoTokenizer.from_pretrained(model_path)
2626
except Exception as e:
27+
print(f"Failed to load tokenizer: {e}")
2728
return None # Cannot proceed without tokenizer
2829

2930
# Define generation config
@@ -46,6 +47,7 @@ def evaluate_single_task(model_path, env_server_base, max_rounds, idx):
4647
"timeout": 300,
4748
},
4849
n_clients=1, # Evaluate one task index at a time
50+
4951
)
5052

5153
# Initialize Evaluator
@@ -59,14 +61,30 @@ def evaluate_single_task(model_path, env_server_base, max_rounds, idx):
5961
)
6062

6163
# Extract experience data if successful
62-
if result and result.experiences:
63-
experience = result.experiences[0]
64-
# Return entire experience object including conversation, reward, and success
65-
return {
66-
"conversation": getattr(experience, 'conversation', None),
67-
"reward": getattr(experience, 'reward', 0.0),
68-
"success": 1 if getattr(experience, 'reward', 0.0) == 1 else 0
69-
}
64+
# if result and result.experiences:
65+
# experience = result.experiences[0]
66+
# # Return entire experience object including conversation, reward, and success
67+
# return {
68+
# "conversation": getattr(experience, 'conversation', None),
69+
# "reward": getattr(experience, 'reward', 0.0),
70+
# "success": 1 if getattr(experience, 'reward', 0.0) == 1 else 0
71+
# }
72+
73+
# Build the result dict from the first returned experience, if any
74+
if result:
75+
# Access raw experience objects directly
76+
if hasattr(result, 'experiences') and result.experiences:
77+
exp = result.experiences[0]
78+
# Print detailed debug information about the experience
79+
print(f"Task {idx} - Experience object type: {type(exp)}")
80+
print(f"Task {idx} - Available attributes: {dir(exp)}")
81+
print(f"Task {idx} - Raw reward value: {getattr(exp, 'reward', None)}")
82+
83+
return {
84+
"conversation": getattr(exp, 'conversation', []),
85+
"reward": getattr(exp, 'reward', 0.0), # Make sure we're accessing the raw reward
86+
"success": 1 if getattr(exp, 'reward', 0.0) == 1 else 0
87+
}
7088
else:
7189
return None
7290

@@ -80,13 +98,13 @@ def main():
8098

8199
# --- Argument Parsing ---
82100
parser = argparse.ArgumentParser(description='Run WebShop evaluation concurrently, initialize evaluator per worker, and save results to JSONL.')
83-
parser.add_argument('--model_name', type=str, default='Qwen3-8B', help='Name of the model being evaluated (e.g., AgentLM-7B)')
101+
parser.add_argument('--model_name', type=str, default='Qwen3-4B', help='Name of the model being evaluated (e.g., AgentLM-7B)')
84102
parser.add_argument('--sector', type=str, default='eval', help='Sector or domain of the evaluation (e.g., WebShop)')
85103
parser.add_argument('--num_tasks', type=int, default=100, help='Number of tasks to process (default: 100)')
86104
parser.add_argument('--max_workers', type=int, default=20, help='Maximum number of concurrent workers (default: 20)')
87-
parser.add_argument('--model_path', type=str, default="/data1/models/Qwen/Qwen3-8B-FP8", help='Path to the model directory')
105+
parser.add_argument('--model_path', type=str, default="/data1/models/Qwen/Qwen3-4B-FP8", help='Path to the model directory')
88106
parser.add_argument('--env_server_base', type=str, default="http://127.0.0.1:36001", help='Base URL for the environment server')
89-
parser.add_argument('--max_rounds', type=int, default=7, help='Maximum interaction rounds per task (default: 7)')
107+
parser.add_argument('--max_rounds', type=int, default=20, help='Maximum interaction rounds per task (default: 7)')
90108
parser.add_argument('--output_file', type=str, default="", help='Output file path (default: {model_name}_{sector}.jsonl)')
91109

92110
args = parser.parse_args()

0 commit comments

Comments
 (0)