
Commit 6606656
fix typo
Parent: a76cff2

File tree: 4 files changed, 172 additions and 3 deletions


.gitignore

Lines changed: 2 additions & 1 deletion
Lines changed: 2 additions & 1 deletion

@@ -144,4 +144,5 @@ test/
 
 easy_*
 normal_*
-outputs_*
+outputs_*
+*_outputs
README.md

Lines changed: 24 additions & 1 deletion
@@ -4,7 +4,7 @@
 1. Embodied-Planner-R1 is based on verl with vLLM>=0.8
 ```
 # Create the conda environment
-conda create -n Embodied-Planner-R1 python==3.10
+conda create -n Embodied-Planner-R1 python=3.10
 conda activate Embodied-Planner-R1
 
 cd Embodied-Planner-R1
@@ -36,6 +36,7 @@ conda create --name scienceworld python=3.8
 conda activate scienceworld
 
 pip install scienceworld
+conda install -y -c conda-forge openjdk=11
 pip install fastapi
 pip install uvicorn
 ```
@@ -54,3 +55,25 @@ bash cmd/alf.sh
 
 bash cmd/sci_easy.sh
 ```
+
+## 4. Evaluation
+```
+# We follow the framework of MINT to evaluate models.
+cd verl/eval_agent
+conda create -n eval_agent python=3.10
+conda activate eval_agent
+bash setup.sh
+
+conda create -n vllm python=3.10
+conda activate vllm
+pip install vllm
+
+# Deploy the model
+python -m vllm.entrypoints.openai.api_server --served-model-name embodied_r1_alfworld --model /path/to/model --port 8000 --disable-frontend-multiprocessing --gpu-memory-utilization 0.99 --max-model-len 4096 --enforce-eager
+
+# Start evaluation
+conda activate eval_agent
+
+python -m eval_agent.main --agent_config er1_alfworld --exp_config alfworld_v2 --split dev --verbose  # you can find more examples in eval.sh
+```
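
Once the vLLM server from the evaluation snippet above is running, it can be sanity-checked before launching eval_agent by querying the OpenAI-compatible HTTP endpoints that vLLM exposes. A minimal sketch, assuming the port (8000) and served model name (embodied_r1_alfworld) from the command above and that the server runs on the same machine:

```
# List the models the server is serving; embodied_r1_alfworld should appear
curl http://localhost:8000/v1/models

# Send one chat-completion request to confirm the model responds
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "embodied_r1_alfworld",
       "messages": [{"role": "user", "content": "You are in a room with a table and a mug. Pick up the mug."}],
       "max_tokens": 64}'
```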
Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ else
 fi
 
 cd $REPO_HOME
-conda activate embodied-r1
+conda activate Embodied-Planner-R1
 cmd="bash ${bash_path}"
 echo "Running $cmd"
 
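The `conda activate Embodied-Planner-R1` call above runs inside a non-interactive bash script, where `conda activate` only works after conda's shell hook has been sourced. If the script does not already do this elsewhere, a minimal sketch of the usual workaround (it assumes only that the `conda` binary is on PATH, since the base prefix is resolved via `conda info --base`):

```
# Load conda's shell functions so `conda activate` works in a non-interactive script
source "$(conda info --base)/etc/profile.d/conda.sh"
conda activate Embodied-Planner-R1
```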
get_data/alfworld/base_config.yaml

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
dataset:
  data_path: '$ALFWORLD_DATA/json_2.1.1/train'
  eval_id_data_path: '$ALFWORLD_DATA/json_2.1.1/valid_seen'    # null/None to disable
  eval_ood_data_path: '$ALFWORLD_DATA/json_2.1.1/valid_unseen' # null/None to disable
  num_train_games: -1   # max training games (<=0 indicates full dataset)
  num_eval_games: -1    # max evaluation games (<=0 indicates full dataset)

logic:
  domain: '$ALFWORLD_DATA/logic/alfred.pddl'   # PDDL domain file that defines the world dynamics
  grammar: '$ALFWORLD_DATA/logic/alfred.twl2'  # Grammar file that defines the text feedbacks

env:
  type: 'AlfredTWEnv'             # 'AlfredTWEnv' or 'AlfredThorEnv' or 'AlfredHybrid'
  regen_game_files: False         # check if game is solvable by expert and save to game.tw-pddl file
  domain_randomization: False     # shuffle Textworld print order and object id nums
  task_types: [1, 2, 3, 4, 5, 6]  # task-type ids: 1 - Pick & Place, 2 - Examine in Light, 3 - Clean & Place, 4 - Heat & Place, 5 - Cool & Place, 6 - Pick Two & Place
  expert_timeout_steps: 150       # max steps before timeout for expert to solve the task
  expert_type: "handcoded"        # 'handcoded' or 'downward'. Note: the downward planner is very slow for real-time use
  goal_desc_human_anns_prob: 0.0  # prob of using human-annotated goal language instead of templated goals (1.0 indicates all human annotations from ALFRED)

  hybrid:
    start_eps: 100000  # starting episode of hybrid training, tw-only training up to this point
    thor_prob: 0.5     # prob of AlfredThorEnv during hybrid training
    eval_mode: "tw"    # 'tw' or 'thor' - env used for evaluation during hybrid training

  thor:
    screen_width: 300            # width of THOR window
    screen_height: 300           # height of THOR window
    smooth_nav: False            # smooth rotations, looks, and translations during navigation (very slow)
    save_frames_to_disk: False   # save frame PNGs to disk (useful for making videos)
    save_frames_path: './videos/' # path to save frame PNGs

controller:
  type: 'oracle'     # 'oracle' or 'oracle_astar' or 'mrcnn' or 'mrcnn_astar' (aka BUTLER)
  debug: False
  load_receps: True  # load receptacle locations from precomputed dict (if available)

mask_rcnn:
  pretrained_model_path: '$ALFWORLD_DATA/detectors/mrcnn.pth'

general:
  random_seed: 42
  use_cuda: True                  # disable this when running on a machine without cuda
  visdom: False                   # plot training/eval curves, run with visdom server
  task: 'alfred'
  training_method: 'dagger'       # 'dqn' or 'dagger'
  save_path: './training/'        # path to save pytorch models
  observation_pool_capacity: 3    # k-size queue, 0 indicates no observation
  hide_init_receptacles: False    # remove initial observation containing navigable receptacles

  training:
    batch_size: 10
    max_episode: 50000
    smoothing_eps: 0.1
    optimizer:
      learning_rate: 0.001
      clip_grad_norm: 5

  evaluate:
    run_eval: True
    batch_size: 10
    env:
      type: "AlfredTWEnv"

  checkpoint:
    report_frequency: 1000                 # report every N episodes
    experiment_tag: 'test'                 # name of experiment
    load_pretrained: False                 # during test, enable this so that the agent loads your pretrained model
    load_from_tag: 'not loading anything'  # name of pre-trained model to load in save_path

  model:
    encoder_layers: 1
    decoder_layers: 1
    encoder_conv_num: 5
    block_hidden_dim: 64
    n_heads: 1
    dropout: 0.1
    block_dropout: 0.1
    recurrent: True

rl:
  action_space: "admissible"  # 'admissible' (candidates from text engine) or 'generation' (seq2seq-style generation) or 'beam_search_choice' or 'exhaustive' (not working)
  max_target_length: 20       # max token length for seq2seq generation
  beam_width: 10              # 1 means greedy
  generate_top_k: 3

  training:
    max_nb_steps_per_episode: 50          # terminate after this many steps
    learn_start_from_this_episode: 0      # delay updates until this episode
    target_net_update_frequency: 500      # sync target net with online net per this many epochs

  replay:
    accumulate_reward_from_final: True
    count_reward_lambda: 0.0         # 0 to disable
    novel_object_reward_lambda: 0.0  # 0 to disable
    discount_gamma_game_reward: 0.9
    discount_gamma_count_reward: 0.5
    discount_gamma_novel_object_reward: 0.5
    replay_memory_capacity: 500000   # adjust this depending on your RAM size
    replay_memory_priority_fraction: 0.5
    update_per_k_game_steps: 5
    replay_batch_size: 64
    multi_step: 3
    replay_sample_history_length: 4
    replay_sample_update_from: 2

  epsilon_greedy:
    noisy_net: False                # if this is true, then epsilon greedy is disabled
    epsilon_anneal_episodes: 1000   # -1 if not annealing
    epsilon_anneal_from: 0.3
    epsilon_anneal_to: 0.1

dagger:
  action_space: "generation"        # 'admissible' (candidates from text engine) or 'generation' (seq2seq-style generation) or 'exhaustive' (not working)
  max_target_length: 20             # max token length for seq2seq generation
  beam_width: 10                    # 1 means greedy
  generate_top_k: 5
  unstick_by_beam_search: False     # use beam-search for failed actions, set True during evaluation

  training:
    max_nb_steps_per_episode: 50    # terminate after this many steps

  fraction_assist:
    fraction_assist_anneal_episodes: 50000
    fraction_assist_anneal_from: 1.0
    fraction_assist_anneal_to: 0.01

  fraction_random:
    fraction_random_anneal_episodes: 0
    fraction_random_anneal_from: 0.0
    fraction_random_anneal_to: 0.0

  replay:
    replay_memory_capacity: 500000
    update_per_k_game_steps: 5
    replay_batch_size: 64
    replay_sample_history_length: 4
    replay_sample_update_from: 2

vision_dagger:
  model_type: "resnet"                  # 'resnet' (whole image features) or 'maskrcnn_whole' (whole image MaskRCNN feats) or 'maskrcnn' (top k MaskRCNN detection feats) or 'no_vision' (zero vision input)
  resnet_fc_dim: 64
  maskrcnn_top_k_boxes: 10              # top k box features
  use_exploration_frame_feats: False    # append feats from initial exploration (memory intensive!)
  sequence_aggregation_method: "average" # 'sum' or 'average' or 'rnn'