Alfworld Concatenated Multi-turn RFT SFT format AND settings. (#442)

kokolerk · hiyuchang · web-flow · commit 25b8e11bc346 · 2025-12-15T19:57:09.000+08:00
Co-authored-by: Yuchang Sun <52027540+hiyuchang@users.noreply.github.com>
diff --git a/examples/grpo_alfworld/README.md b/examples/grpo_alfworld/README.md
@@ -5,3 +5,51 @@ This example shows the usage of GRPO on the ALFWorld dataset.
 For more detailed information, please refer to the [documentation](../../docs/sphinx_doc/source/tutorial/example_multi_turn.md).
 
 The config file is located in [`alfworld.yaml`](alfworld.yaml).
+
+NOTE: In the Concatenated Multi-Turn RFT setup, Qwen-2.5 series models may not follow the `<think></think><action></action>` format. In that case, run SFT first and then GRPO.
+
+The SFT data file should be named `<TRINITY_SFT_DATASET_PATH>/data.json` and use the following format:
+
+```
+[
+    {
+         "messages": [
+            {
+                "role": "system", # fixed; keep aligned with the GRPO workflow: alfworld_workflow.
+                "content": "\nYou are an agent interacting with a virtual test-based environments.\n\n## Notes:\nAt each step, you should first think then perform action to fulfill the instruction. You should ALWAYS wrap your thinking with the <think> </think> tag and wrap your action with the <action> </action> tag.\nYou should ALWAYS take one action each step. \nYou should finish the task and buy the item within 15 steps.\nDONOT try to interact with the user at anytime. Finish the task and buy the item by yourself.\n\n## Action Format:\nBelow are the available commands you can use:\n  look:                             look around your current location\n  inventory:                        check your current inventory(you can only have 1 item in your inventory)\n  go to (receptacle):               move to a receptacle\n  open (receptacle):                open a receptacle\n  close (receptacle):               close a receptacle\n  take (object) from (receptacle):  take an object from a receptacle\n  move (object) to (receptacle):  place an object in or on a receptacle\n  examine (something):              examine a receptacle or an object\n  use (object):                     use an object\n  heat (object) with (receptacle):  heat an object using a receptacle\n  clean (object) with (receptacle): clean an object using a receptacle\n  cool (object) with (receptacle):  cool an object using a receptacle\n  slice (object) with (object):     slice an object using a sharp object\n\nFor example your output should be like this:\n<think> To solve the task, I need first to ... </think><action>go to cabinet 1</action>\n"
+            },
+            {
+                "role": "user",
+                "content": "Observation: {observation by alfworld}"
+            },
+            {
+                "role": "assistant",
+                "content": "<think>think process</think><action>action</action>"
+            },
+            {
+                "role": "user",
+                "content": "Observation: {observation by alfworld}"
+            },
+            {
+                "role": "assistant",
+                "content": "<think>think process</think><action>action</action>"
+            },
+            .....
+        ],
+    },
+    {
+        "messages": [
+            {
+                ......
+            },
+        ]
+    },
+    {
+         "messages": [
+            {
+                .......
+            },
+        ]
+    },
+]
+```
diff --git a/examples/grpo_alfworld/alfworld.yaml b/examples/grpo_alfworld/alfworld.yaml
@@ -7,9 +7,9 @@ algorithm:
   optimizer:
     lr: 1e-6
 model:
-  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
-  max_response_tokens: 16384
-  max_model_len: 20480
+  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-3B-Instruct}
+  max_prompt_tokens: 10240 # max input tokens per turn
+  max_response_tokens: 4096 # max output tokens per turn
 cluster:
   node_num: 1
   gpu_per_node: 8
@@ -77,4 +77,5 @@ trainer:
 #           format:
 #             prompt_type: messages
 #             messages_key: 'messages'
+#             enable_concatenated_multi_turn: true # Enable concatenated multi-turn SFT data preprocessing (default: false)
 #   - stage_name: rft