diff --git a/docs_roll/docs/English/QuickStart/alicloud_pipeline_quick_start_en.md b/docs_roll/docs/English/QuickStart/alicloud_pipeline_quick_start_en.md deleted file mode 100644 index 70da5dc47..000000000 --- a/docs_roll/docs/English/QuickStart/alicloud_pipeline_quick_start_en.md +++ /dev/null @@ -1,145 +0,0 @@ -# Quickstart: Singel Node based on Alibaba Cloud - -## Environment Preparation -1. Purchase an Alibaba Cloud Server -- For a single-machine setup, consider a GPU instance with **NVIDIA V100**. -- **Recommendation:** When purchasing a GPU instance via the ECS console, it's advised to select the option to automatically install GPU drivers. -2. Remote Connect to the GPU Instance and access the machine terminal -3. Install NVIDIA Container Toolkit -```shell -curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ - sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo - -sudo yum install -y nvidia-container-toolkit -``` -4. Install Docker Environment:refer to https://developer.aliyun.com/mirror/docker-ce/ -```shell -# step 1: install necessary system tools -sudo yum install -y yum-utils - -# Step 2: add software repository information -sudo yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo - -# Step 3: install Docker -sudo yum install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# Step 4: start Docker service -sudo service docker start - -# Verify that GPUs are visible -docker version -``` - - -## Environment Configuration -```shell -# 1. 
Pull Docker image -sudo docker pull - -# Image Addresses (choose based on your needs) -# torch2.6.0 + SGlang0.4.6: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-sglang046 -# torch2.6.0 + vLLM0.8.4: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-vllm084 -# torch2.5.1 + SGlang0.4.3: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch251-sglang043 -# torch2.5.1 + vLLM0.7.3: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch251-vllm073 - -# 2. Start a Docker container with GPU support and keep the container running -sudo docker images -sudo docker run -dit \ - --gpus all \ - --network=host \ - --ipc=host \ - --shm-size=2gb \ - \ - /bin/bash - -# 3. Enter the Docker container (execute this command every time you reconnect) -sudo docker ps -sudo docker exec -it /bin/bash - -# 4. Verify GPU visibility -nvidia-smi - -# 5. Download the code - -# Install git(this image is ubuntu-based)and clone the repo -apt update && apt install git -y -git clone https://github.com/alibaba/ROLL.git - -# If Github is not accessible, download the zip file and unzip it -wget https://github.com/alibaba/ROLL/archive/refs/heads/main.zip -unzip main.zip - -# 5. 
Install dependencies (select the requirements file corresponding to your chosen image) -cd ROLL-main -pip install -r requirements_torch260_sglang.txt -i https://mirrors.aliyun.com/pypi/simple/ -``` - -## Pipeline Execution -```shell -# If you encounter "ModuleNotFoundError: No module named 'roll'", you need to add environment variables -export PYTHONPATH="/workspace/ROLL-main:$PYTHONPATH" - -# Method 1: Specify the YAML file path, with the script directory (examples) as the root -python examples/start_agentic_pipeline.py --config_path qwen2.5-0.5B-agentic_ds --config_name agent_val_frozen_lake - -# Method 2: Execute the shell script directly -bash examples/qwen2.5-0.5B-agentic_ds/run_agentic_pipeline_frozen_lake.sh - -# Modify the configuration as needed -vim examples/qwen2.5-0.5B-agentic_ds/agent_val_frozen_lake.yaml -``` - -Key Configuration Modifications for Single V100 GPU Memory: -```yaml -# Reduce the system's expected number of GPUs from 8 to your actual 1 V100 -num_gpus_per_node: 1 -# Training processes are now mapped only to GPU 0 -actor_train.device_mapping: list(range(0,1)) -# Inference processes are now mapped only to GPU 0 -actor_infer.device_mapping: list(range(0,1)) -# Reference model processes are now mapped only to GPU 0 -reference.device_mapping: list(range(0,1)) - -# Significantly reduce the batch sizes for Rollout and Validation stages to prevent out-of-memory errors on a single GPU -rollout_batch_size: 16 -val_batch_size: 16 - -# V100 has better native support for FP16 than BF16 (unlike A100/H100). Switching to FP16 improves compatibility and stability, while also saving GPU memory. -actor_train.model_args.dtype: fp16 -actor_infer.model_args.dtype: fp16 -reference.model_args.dtype: fp16 - -# Switch the large model training framework from DeepSpeed to Megatron-LM. Parameters can be sent in batches, resulting in faster execution. 
-strategy_name: megatron_train -strategy_config: - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - expert_model_parallel_size: 1 - use_distributed_optimizer: true - recompute_granularity: full - -# In megatron training the global train batch size is equivalent to per_device_train_batch_size * gradient_accumulation_steps * world_size -actor_train.training_args.per_device_train_batch_size: 1 -actor_train.training_args.gradient_accumulation_steps: 16 - -# Reduce the maximum number of actions per trajectory, making each Rollout trajectory shorter that reduces the length of LLM-generated content. -max_actions_per_traj: 10 - -# Reduce the number of parallel training and validation environment groups to accommodate single-GPU resources. -train_env_manager.env_groups: 1 -train_env_manager.n_groups: 1 -val_env_manager.env_groups: 2 -val_env_manager.n_groups: [1, 1] -val_env_manager.tags: [SimpleSokoban, FrozenLake] - -# Reduce the total number of training steps for quicker full pipeline runs, useful for rapid debugging. 
-max_steps: 100 -``` - -Example Log Screenshots during Pipeline Execution: -![log1](../../../static/img/log_1.png) - -![log2](../../../static/img/log_2.png) - -![log3](../../../static/img/log_3.png) - diff --git a/docs_roll/docs/English/QuickStart/image_address.md b/docs_roll/docs/English/QuickStart/image_address.md new file mode 100644 index 000000000..cf118e2a2 --- /dev/null +++ b/docs_roll/docs/English/QuickStart/image_address.md @@ -0,0 +1,5 @@ +# Images provided: +torch2.6.0 + SGLang0.4.6: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-sglang046 +torch2.6.0 + vLLM0.8.4: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-vllm084 +torch2.5.1 + SGLang0.4.3: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch251-sglang043 +torch2.5.1 + vLLM0.7.3: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch251-vllm073 \ No newline at end of file diff --git a/docs_roll/docs/English/QuickStart/single_node_quick_start_en.md b/docs_roll/docs/English/QuickStart/single_node_quick_start_en.md new file mode 100644 index 000000000..2d7005aa1 --- /dev/null +++ b/docs_roll/docs/English/QuickStart/single_node_quick_start_en.md @@ -0,0 +1,97 @@ +# Quickstart: Single Node Deployment Guide + +## Environment Preparation +1. Purchase a machine equipped with a GPU and install the GPU drivers +2. Connect remotely to the GPU instance and access the machine terminal +3. Install the Docker environment and NVIDIA Container Toolkit +```shell +curl -fsSL https://raw.githubusercontent.com/alibaba/ROLL/main/scripts/install_docker_nvidia_container_toolkit.sh | sudo bash +``` + +## Environment Configuration +Choose your desired image from the [image addresses](https://github.com/alibaba/ROLL/blob/main/docs_roll/docs/English/QuickStart/image_address.md). The following example will use *torch2.6.0 + vLLM0.8.4*. +```shell +# 1. 
Start a Docker container with GPU support, expose the port, and keep the container running. +sudo docker run -dit \ + --gpus all \ + -p 9001:22 \ + --ipc=host \ + --shm-size=10gb \ + roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-vllm084 \ + /bin/bash + +# 2. Enter the Docker container +# You can find your running container's ID or name using `sudo docker ps`. +sudo docker ps +sudo docker exec -it <container_id> /bin/bash + +# 3. Verify GPU visibility +nvidia-smi + +# 4. Clone the project repo +git clone https://github.com/alibaba/ROLL.git + +# 5. Install dependencies (select the requirements file corresponding to your chosen image) +cd ROLL +pip install -r requirements_torch260_vllm.txt -i https://mirrors.aliyun.com/pypi/simple/ +``` + +## Pipeline Execution +```shell +bash examples/agentic_demo/run_agentic_pipeline_frozen_lake_single_node_demo.sh +``` + +Example Log Screenshots during Pipeline Execution: +![log1](../../../static/img/log_1.png) + +![log2](../../../static/img/log_2.png) + +![log3](../../../static/img/log_3.png) + + +## Reference: V100 Single-GPU Memory Configuration Optimization +```yaml +# Reduce the system's expected number of GPUs from 8 to the single V100 you actually have +num_gpus_per_node: 1 +# Training processes are now mapped only to GPU 0 +actor_train.device_mapping: list(range(0,1)) +# Inference processes are now mapped only to GPU 0 +actor_infer.device_mapping: list(range(0,1)) +# Reference model processes are now mapped only to GPU 0 +reference.device_mapping: list(range(0,1)) + +# Significantly reduce the batch sizes for the Rollout and Validation stages to prevent out-of-memory errors on a single GPU +rollout_batch_size: 16 +val_batch_size: 16 + +# V100 has better native support for FP16 than BF16 (unlike A100/H100). Switching to FP16 improves compatibility and stability, while also saving GPU memory. 
+actor_train.model_args.dtype: fp16 +actor_infer.model_args.dtype: fp16 +reference.model_args.dtype: fp16 + +# Switch the large model training framework from DeepSpeed to Megatron-LM. Parameters can be sent in batches, resulting in faster execution. +strategy_name: megatron_train +strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + +# In Megatron training, the global train batch size equals per_device_train_batch_size * gradient_accumulation_steps * world_size +actor_train.training_args.per_device_train_batch_size: 1 +actor_train.training_args.gradient_accumulation_steps: 16 + +# Reduce the maximum number of actions per trajectory, making each Rollout trajectory shorter, which reduces the length of LLM-generated content. +max_actions_per_traj: 10 + +# Reduce the number of parallel training and validation environment groups to accommodate single-GPU resources. +train_env_manager.env_groups: 1 +train_env_manager.n_groups: 1 +val_env_manager.env_groups: 2 +val_env_manager.n_groups: [1, 1] +val_env_manager.tags: [SimpleSokoban, FrozenLake] + +# Reduce the total number of training steps for quicker full pipeline runs, useful for rapid debugging. +max_steps: 100 +``` diff --git "a/docs_roll/docs/\347\256\200\344\275\223\344\270\255\346\226\207/\345\277\253\351\200\237\345\274\200\345\247\213/alicloud_pipeline_quick_start_cn.md" "b/docs_roll/docs/\347\256\200\344\275\223\344\270\255\346\226\207/\345\277\253\351\200\237\345\274\200\345\247\213/alicloud_pipeline_quick_start_cn.md" deleted file mode 100644 index cc809ac77..000000000 --- "a/docs_roll/docs/\347\256\200\344\275\223\344\270\255\346\226\207/\345\277\253\351\200\237\345\274\200\345\247\213/alicloud_pipeline_quick_start_cn.md" +++ /dev/null @@ -1,147 +0,0 @@ -# 快速上手:阿里云单机版部署指南 - -## 准备环境 -1. 购买阿里云服务器 -- 单机版本 可选择 GPU:NVIDIA V100 -- 建议您通过ECS控制台购买GPU实例时,同步选中安装GPU驱动 -2. 
远程连接GPU实例,进入机器终端 -3. 安装 NVIDIA容器工具包 -```shell -curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | \ - sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo - -# 安装 NVIDIA Container Toolkit 软件包 -sudo yum install -y nvidia-container-toolkit - -``` -4. 安装 Docker 环境:参考 https://developer.aliyun.com/mirror/docker-ce/ -```shell -# step 1: 安装必要的一些系统工具 -sudo yum install -y yum-utils - -# Step 2: 添加软件源信息 -sudo yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo - -# Step 3: 安装Docker -sudo yum install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin - -# Step 4: 开启Docker服务 -sudo service docker start - -# 安装校验 -docker version -``` - - -## 环境配置 -```shell -# 1. 拉取docker镜像 -sudo docker pull - -# 镜像地址 -# torch2.6.0 + SGlang0.4.6: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-sglang046 -# torch2.6.0 + vLLM0.8.4: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-vllm084 -# torch2.5.1 + SGlang0.4.3: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch251-sglang043 -# torch2.5.1 + vLLM0.7.3: roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch251-vllm073 - -# 2. 启动一个docker容器,指定GPU支持,并始终保持容器运行 -sudo docker images -sudo docker run -dit \ - --gpus all \ - --network=host \ - --ipc=host \ - --shm-size=2gb \ - \ - /bin/bash - -# 3. 进入docker容器(下次重新连接也要执行这条命令) -sudo docker ps -sudo docker exec -it /bin/bash - -# 4. 验证GPU是否可见 -nvidia-smi - -# 5. 下载代码 - -# 安装git(该镜像是ubuntu系统),并clone代码 -apt update && apt install git -y -git clone https://github.com/alibaba/ROLL.git - -# 若无法访问github,直接下载zip文件并解压 -wget https://github.com/alibaba/ROLL/archive/refs/heads/main.zip -unzip main.zip - -# 5. 
安装依赖(选择对应镜像的requirements文件) -cd ROLL-main -pip install -r requirements_torch260_sglang.txt -i https://mirrors.aliyun.com/pypi/simple/ -``` - -## pipeline运行 -```shell -# 若执行报错 ModuleNotFoundError: No module named 'roll',需要添加环境变量 -export PYTHONPATH="/workspace/ROLL-main:$PYTHONPATH" - -# 方法一:指定yaml文件路径,需要以脚本目录即examples为根目录 -python examples/start_agentic_pipeline.py --config_path qwen2.5-0.5B-agentic_ds --config_name agent_val_frozen_lake - -# 方法二:直接执行sh脚本 -bash examples/qwen2.5-0.5B-agentic_ds/run_agentic_pipeline_frozen_lake.sh - -# 根据需要修改config -vim examples/qwen2.5-0.5B-agentic_ds/agent_val_frozen_lake.yaml -``` - -单卡V100显存config修改要点: -```yaml -# 将系统预期的GPU数量从8块减少到你实际拥有的1块V100 -num_gpus_per_node: 1 -# 训练进程现在只映射到 GPU 0 -actor_train.device_mapping: list(range(0,1)) -# 推理进程现在只映射到 GPU 0 -actor_infer.device_mapping: list(range(0,1)) -# 参考模型进程现在只映射到 GPU 0 -reference.device_mapping: list(range(0,1)) - -# 大幅减小Rollout阶段/Validation阶段的批量大小,防止单GPU处理大批次时显存不足 -rollout_batch_size: 16 -val_batch_size: 16 - -# V100 对 FP16 有较好的原生支持,而对 BF16 的支持不如 A100/H100,切换到 FP16 可以提高兼容性和稳定性,同时节省显存。 -actor_train.model_args.dtype: fp16 -actor_infer.model_args.dtype: fp16 -reference.model_args.dtype: fp16 - -# 大模型训练框架从 DeepSpeed 切换到 Megatron-LM,参数可以批量发送,运行速度更快 -strategy_name: megatron_train -strategy_config: - tensor_model_parallel_size: 1 - pipeline_model_parallel_size: 1 - expert_model_parallel_size: 1 - use_distributed_optimizer: true - recompute_granularity: full - -# 在 Megatron 训练中,全局训练批次大小是 per_device_train_batch_size * gradient_accumulation_steps * world_size -actor_train.training_args.per_device_train_batch_size: 1 -actor_train.training_args.gradient_accumulation_steps: 16 - -# 减少每条轨迹的最大动作数,使得每个Rollout轨迹更短,减少了LLM生成内容的长度 -max_actions_per_traj: 10 - -# 减少并行运行的训练环境组和验证环境组,以适配单GPU资源 -train_env_manager.env_groups: 1 -train_env_manager.n_groups: 1 -val_env_manager.env_groups: 2 -val_env_manager.n_groups: [1, 1] -val_env_manager.tags: [SimpleSokoban, FrozenLake] - -# 减少总的训练步骤,以便更快运行一个完整的训练流程,用于快速调试 
-max_steps: 100 -``` - -pipeline运行中的log截图示例: -![log1](../../../static/img/log_1.png) - -![log2](../../../static/img/log_2.png) - -![log3](../../../static/img/log_3.png) - diff --git "a/docs_roll/docs/\347\256\200\344\275\223\344\270\255\346\226\207/\345\277\253\351\200\237\345\274\200\345\247\213/single_node_quick_start_cn.md" "b/docs_roll/docs/\347\256\200\344\275\223\344\270\255\346\226\207/\345\277\253\351\200\237\345\274\200\345\247\213/single_node_quick_start_cn.md" new file mode 100644 index 000000000..c3742e185 --- /dev/null +++ "b/docs_roll/docs/\347\256\200\344\275\223\344\270\255\346\226\207/\345\277\253\351\200\237\345\274\200\345\247\213/single_node_quick_start_cn.md" @@ -0,0 +1,96 @@ +# 快速上手:单机版部署指南 + +## 准备环境 +1. 购买配备GPU的机器,并同步安装GPU驱动 +2. 远程连接GPU实例,进入机器终端 +3. 运行以下命令安装 Docker环境 和 NVIDIA容器工具包 +```shell +curl -fsSL https://raw.githubusercontent.com/alibaba/ROLL/main/scripts/install_docker_nvidia_container_toolkit.sh | sudo bash +``` + +## 环境配置 +从[镜像地址](https://github.com/alibaba/ROLL/blob/main/docs_roll/docs/English/QuickStart/image_address.md)中选择你需要的Docker镜像,下文均以 *torch2.6.0 + vLLM0.8.4* 为例 + +```shell +# 1. 启动一个docker容器,指定GPU支持,暴露容器端口,并始终保持容器运行 +sudo docker run -dit \ + --gpus all \ + -p 9001:22 \ + --ipc=host \ + --shm-size=10gb \ + roll-registry.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-24.05-py3-torch260-vllm084 \ + /bin/bash + +# 2. 进入docker容器 +# 您可以使用 `sudo docker ps` 命令查找运行中的容器ID或名称。 +sudo docker exec -it <container_id> /bin/bash + +# 3. 验证GPU是否可见 +nvidia-smi + +# 4. 克隆项目代码 +git clone https://github.com/alibaba/ROLL.git + +# 5. 
安装项目依赖(选择对应镜像的requirements文件) +cd ROLL +pip install -r requirements_torch260_vllm.txt -i https://mirrors.aliyun.com/pypi/simple/ +``` + +## pipeline运行 +```shell +bash examples/agentic_demo/run_agentic_pipeline_frozen_lake_single_node_demo.sh +``` + +pipeline运行中的log截图示例: +![log1](../../../static/img/log_1.png) + +![log2](../../../static/img/log_2.png) + +![log3](../../../static/img/log_3.png) + + +## 参考:单卡V100显存 config修改要点 +```yaml +# 将系统预期的GPU数量从8块减少到你实际拥有的1块V100 +num_gpus_per_node: 1 +# 训练进程现在只映射到 GPU 0 +actor_train.device_mapping: list(range(0,1)) +# 推理进程现在只映射到 GPU 0 +actor_infer.device_mapping: list(range(0,1)) +# 参考模型进程现在只映射到 GPU 0 +reference.device_mapping: list(range(0,1)) + +# 大幅减小Rollout阶段/Validation阶段的批量大小,防止单GPU处理大批次时显存不足 +rollout_batch_size: 16 +val_batch_size: 16 + +# V100 对 FP16 有较好的原生支持,而对 BF16 的支持不如 A100/H100,切换到 FP16 可以提高兼容性和稳定性,同时节省显存。 +actor_train.model_args.dtype: fp16 +actor_infer.model_args.dtype: fp16 +reference.model_args.dtype: fp16 + +# 大模型训练框架从 DeepSpeed 切换到 Megatron-LM,参数可以批量发送,运行速度更快 +strategy_name: megatron_train +strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + +# 在 Megatron 训练中,全局训练批次大小是 per_device_train_batch_size * gradient_accumulation_steps * world_size +actor_train.training_args.per_device_train_batch_size: 1 +actor_train.training_args.gradient_accumulation_steps: 16 + +# 减少每条轨迹的最大动作数,使得每个Rollout轨迹更短,减少了LLM生成内容的长度 +max_actions_per_traj: 10 + +# 减少并行运行的训练环境组和验证环境组,以适配单GPU资源 +train_env_manager.env_groups: 1 +train_env_manager.n_groups: 1 +val_env_manager.env_groups: 2 +val_env_manager.n_groups: [1, 1] +val_env_manager.tags: [SimpleSokoban, FrozenLake] + +# 减少总的训练步骤,以便更快运行一个完整的训练流程,用于快速调试 +max_steps: 100 +``` diff --git a/examples/agentic_demo/README.md b/examples/agentic_demo/README.md new file mode 100644 index 000000000..fed9d8f57 --- /dev/null +++ b/examples/agentic_demo/README.md @@ -0,0 +1,2 @@ 
+**ATTENTION:** +The config files in this folder are only used for demo/testing, and do not have any effect on the training process. \ No newline at end of file diff --git a/examples/agentic_demo/agent_val_frozen_lake_single_node_demo.yaml b/examples/agentic_demo/agent_val_frozen_lake_single_node_demo.yaml new file mode 100644 index 000000000..357a26dad --- /dev/null +++ b/examples/agentic_demo/agent_val_frozen_lake_single_node_demo.yaml @@ -0,0 +1,193 @@ +defaults: + - ../config/envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "agentic_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: /data/oss_bucket_0/yali/output/render +system_envs: + USE_MODELSCOPE: '1' + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_frozen_lake +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban + +num_gpus_per_node: 1 + +max_steps: 100 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 16 +val_batch_size: 16 +sequence_length: 4096 + +reward_clip: 20 +advantage_clip: 10.0 +ppo_epochs: 1 +adv_estimator: "reinforce" +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +actor_train: + model_args: + flash_attn: fa2 + disable_gradient_checkpointing: false + dtype: fp16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 16 + warmup_steps: 10 + data_args: + template: qwen2_5 + strategy_args: + #strategy_name: 
deepspeed_train + #strategy_config: ${deepspeed_zero2} + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,1)) + infer_batch_size: 1 + +actor_infer: + model_args: + flash_attn: fa2 + disable_gradient_checkpointing: true + dtype: fp16 + generating_args: + max_new_tokens: 32 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(0,1)) + infer_batch_size: 1 + +reference: + model_args: + flash_attn: fa2 + disable_gradient_checkpointing: true + dtype: fp16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,1)) + infer_batch_size: 1 + +enable_response_mask: True +action_sep: "||" +use_turn_scores: False # important to GAE when applying token-level rewards to token-level advantages. If False, will take the sum of scores as the reward for the last turn. +enable_think: False # False -> no think RL +max_actions_per_traj: 10 +reward_normalization: + grouping: tags # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv + method: identity # asym_clip / identity / mean_std + +custom_envs: + SimpleSokoban: + env_type: sokoban + max_actions_per_traj: ${max_actions_per_traj} # used in environment state manager to control the actual max actions executed per trajectory + max_steps_per_traj: ${max_actions_per_traj} + env_instruction: "You are solving the Sokoban puzzle. You are the player and you need to push all boxes to targets. When you are right next to a box, you can push it by moving in the same direction. 
You cannot push a box through a wall, and you cannot pull a box. The answer must be one action in a turn, format is <answer>Right</answer>" + max_tokens: 100 # used to curate llm prompt "max words", not used for rollout + env_config: # keys should be a subset of SokobanConfig + dim_x: 6 + dim_y: 6 + num_boxes: 1 + max_steps: ${max_actions_per_traj} + LargerSokoban: + env_type: sokoban + max_actions_per_traj: ${max_actions_per_traj} + max_steps_per_traj: ${max_actions_per_traj} + env_instruction: "You are solving the Sokoban puzzle. You are the player and you need to push all boxes to targets. When you are right next to a box, you can push it by moving in the same direction. You cannot push a box through a wall, and you cannot pull a box. The answer must be one action in a turn, format is <answer>Right</answer>" + max_tokens: 100 + env_config: + dim_x: 8 + dim_y: 8 + num_boxes: 2 + max_steps: ${max_actions_per_traj} + search_depth: 10 + SokobanDifferentGridVocab: + env_type: sokoban + max_actions_per_traj: ${max_actions_per_traj} + max_steps_per_traj: ${max_actions_per_traj} + env_instruction: "You are solving the Sokoban puzzle. You are the player and you need to push all boxes to targets. When you are right next to a box, you can push it by moving in the same direction. You cannot push a box through a wall, and you cannot pull a box. The answer must be one action in a turn, format is <answer>Right</answer>" + max_tokens: 100 + env_config: # keys should be a subset of SokobanConfig + search_depth: 30 + dim_x: 6 + dim_y: 6 + num_boxes: 1 + max_steps: ${max_actions_per_traj} + grid_lookup: { 0: "W", 1: ".", 2: "G", 3: "C", 4: "B", 5: "A", 6: "@" } + grid_vocab: { "W": "wall", ".": "empty", "G": "target", "C": "box on target", "B": "box", "A": "player", "@": "player on target" } + FrozenLake: + env_type: frozen_lake + max_actions_per_traj: ${max_actions_per_traj} + max_steps_per_traj: ${max_actions_per_traj} + env_instruction: "You are solving the FrozenLake puzzle. Avoid the holes and go to the target. 
You may move to the unintended direction due to the slippery ice. The answer must be one action in a turn, format is <answer>Right</answer>" + max_tokens: 100 + env_config: + is_slippery: false + +train_env_manager: + format_penalty: -0.001 + env_groups: 1 + group_size: 1 + tags: [FrozenLake] + n_groups: [1] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + env_groups: 2 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SimpleSokoban, FrozenLake] + n_groups: [1, 1] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation diff --git a/examples/agentic_demo/run_agentic_pipeline_frozen_lake_single_node_demo.sh b/examples/agentic_demo/run_agentic_pipeline_frozen_lake_single_node_demo.sh new file mode 100755 index 000000000..74a4f3fdf --- /dev/null +++ b/examples/agentic_demo/run_agentic_pipeline_frozen_lake_single_node_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set +x + +ROLL_PATH=$(cd "$(dirname "$0")/../.." && pwd) # repo root resolved from this script's location +CONFIG_PATH=$(basename $(dirname $0)) +export PYTHONPATH="$ROLL_PATH:$PYTHONPATH" +python examples/start_agentic_pipeline.py --config_path $CONFIG_PATH --config_name agent_val_frozen_lake_single_node_demo diff --git a/scripts/install_docker_nvidia_container_toolkit.sh b/scripts/install_docker_nvidia_container_toolkit.sh new file mode 100644 index 000000000..ebc455489 --- /dev/null +++ b/scripts/install_docker_nvidia_container_toolkit.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +echo "[+] Installing system utilities..." +sudo yum install -y yum-utils || echo "[!] Failed to install yum-utils (try manually)" + +echo "[+] Adding Docker repository..." +sudo yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo || echo "[!] 
Failed to add Docker repo (check network)" + +echo "[+] Installing Docker components..." +sudo yum install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin || echo "[!] Failed to install Docker (check dependencies)" + +echo "[+] Starting Docker service..." +sudo systemctl start docker || echo "[!] Failed to start Docker (check logs with 'systemctl status docker')" + +echo "[+] Configuring NVIDIA container repository..." +curl -sSL https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo || echo "[!] Failed to configure NVIDIA repo" + +echo "[+] Installing NVIDIA Container Toolkit..." +sudo yum install -y nvidia-container-toolkit || echo "[!] Failed to install NVIDIA Container Toolkit (check repo)" + +echo "[+] Configuring Docker to use the NVIDIA runtime..." +sudo nvidia-ctk runtime configure --runtime=docker --set-as-default || echo "[!] Failed to configure NVIDIA runtime for Docker" + +echo "[+] Restarting Docker service..." +sudo systemctl restart docker || echo "[!] Failed to restart Docker (check configuration)" + +# The steps below are optional verification +echo +echo "==> Verifying installation (optional)..." + +echo "[i] Checking Docker service status..." +systemctl is-active docker | grep -q "active" && echo "[✓] Docker service is running" || echo "[!] Docker service may not be active" + +echo "[i] Checking NVIDIA runtime configuration..." +grep -q '"default-runtime": "nvidia"' /etc/docker/daemon.json && echo "[✓] NVIDIA runtime is configured" || echo "[!] NVIDIA runtime not found in daemon.json" + +echo "[i] Testing GPU support (run manually if needed):" +echo " docker run --rm --gpus all nvidia/cuda:12.4.0-base-ubuntu22.04 nvidia-smi" \ No newline at end of file
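
Both quickstart configs comment that in Megatron training the global train batch size is per_device_train_batch_size * gradient_accumulation_steps * world_size. A quick shell sketch of that arithmetic for the single-V100 settings above (the variable names here are illustrative, not part of the patch):

```shell
# Megatron global-batch relation, using the values from the demo config:
per_device=1     # actor_train.training_args.per_device_train_batch_size
grad_accum=16    # actor_train.training_args.gradient_accumulation_steps
world_size=1     # a single V100, so world size 1
global_batch=$(( per_device * grad_accum * world_size ))
echo "$global_batch"   # prints 16, matching rollout_batch_size: 16
```

If you raise gradient_accumulation_steps or add GPUs, recompute this product so the effective batch size stays in line with rollout_batch_size.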