
Commit 24fe03f

feat: support sft for embodiment (RLinf#436)

* feat: support openpi sft by libero data and custom data

Signed-off-by: xusi <xusiforwork@gmail.com>

1 parent 497961c · commit 24fe03f

File tree

29 files changed: +1542 −14 lines


.github/workflows/ci-tests.yml

Lines changed: 9 additions & 1 deletion

@@ -95,6 +95,13 @@ jobs:
     if: needs.check-changes.outputs.file_filter == 'true' || needs.check-changes.outputs.install_filter == 'true'
     uses: ./.github/workflows/embodied-e2e-tests.yml

+  # =============================================== sft e2e tests ====================================================
+
+  sft-e2e-tests:
+    needs: [check-changes]
+    if: needs.check-changes.outputs.file_filter == 'true' || needs.check-changes.outputs.install_filter == 'true'
+    uses: ./.github/workflows/sft-e2e-tests.yml
+
   # =============================================== scheduler tests ====================================================

   scheduler-tests:

@@ -112,7 +119,8 @@ jobs:
       unit-tests,
       agent-reason-e2e-tests,
       embodied-e2e-tests,
-      scheduler-tests
+      scheduler-tests,
+      sft-e2e-tests
     ]
     if: always()
     runs-on: ubuntu-latest
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
name: SFT End-to-End Tests

on:
  workflow_call:

jobs:
  sft-maniskill-openpi-test:
    runs-on: embodied
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: Create sft environment
        run: |
          unset UV_DEFAULT_INDEX
          export UV_PATH=/workspace/dataset/.uv
          export UV_LINK_MODE=symlink
          export UV_CACHE_DIR=/workspace/dataset/.uv_cache
          export UV_PYTHON_INSTALL_DIR=/workspace/dataset/.uv_python
          export LIBERO_PATH=/workspace/dataset/LIBERO
          bash requirements/install.sh embodied --model openpi --env maniskill_libero

      - name: SFT ManiSkill OpenPI test
        timeout-minutes: 20
        run: |
          export REPO_PATH=$(pwd)
          source .venv/bin/activate
          bash tests/e2e_tests/sft/run.sh maniskill_sft_openpi

      - name: Clean up
        run: |
          rm -rf .venv
          uv cache prune

README.md

Lines changed: 2 additions & 2 deletions

@@ -114,8 +114,8 @@ RLinf is a flexible and scalable open-source infrastructure designed for post-training
   </ul>
   <li><b>SFT</b></li>
   <ul>
-    <li>Full-parameter SFT</li>
-    <li>LoRA SFT</li>
+    <li><a href="https://rlinf.readthedocs.io/en/latest/rst_source/examples/fine_tine.html">Full-parameter SFT</a> ✅</li>
+    <li><a href="https://rlinf.readthedocs.io/en/latest/rst_source/examples/fine_tine.html">LoRA SFT</a> ✅</li>
   </ul>
   </ul>
   </td>

README.zh-CN.md

Lines changed: 2 additions & 2 deletions

@@ -113,8 +113,8 @@ RLinf is a flexible and scalable open-source framework purpose-built for post-training with reinforcement learning
   </ul>
   <li><b>SFT</b></li>
   <ul>
-    <li>Full-parameter fine-tuning</li>
-    <li>LoRA fine-tuning</li>
+    <li><a href="https://rlinf.readthedocs.io/zh-cn/latest/rst_source/examples/fine_tine.html">Full-parameter fine-tuning</a> ✅</li>
+    <li><a href="https://rlinf.readthedocs.io/zh-cn/latest/rst_source/examples/fine_tine.html">LoRA fine-tuning</a> ✅</li>
   </ul>
   </ul>
   </td>

docs/source-en/rst_source/examples/index.rst

Lines changed: 1 addition & 0 deletions

@@ -257,3 +257,4 @@ Thanks to this decoupled design, workers can be flexibly and dynamically scheduled
   gr00t
   reasoning
   coding_online_rl
+  sft
Lines changed: 112 additions & 0 deletions

@@ -0,0 +1,112 @@

Supervised Fine-Tuning
======================

.. |huggingface| image:: /_static/svg/hf-logo.svg
   :width: 16px
   :height: 16px
   :class: inline-icon

This page explains how to run **full-parameter supervised fine-tuning (SFT)** and **LoRA fine-tuning** with the RLinf framework. SFT is typically the first stage before reinforcement learning: the model imitates high-quality examples so that RL can continue optimization from a strong prior.

Contents
--------

- How to configure full-parameter SFT and LoRA SFT in RLinf
- How to launch training on a single machine or a multi-node cluster
- How to monitor and evaluate results

Supported datasets
------------------

RLinf currently supports datasets in the LeRobot format, selected via **config_type**.

Supported formats include:

- pi0_maniskill
- pi0_libero
- pi05_libero
- pi05_maniskill
- pi05_metaworld
- pi05_calvin
You can also train with a custom dataset format. Refer to the steps below:

1. In ``examples/sft/config/custom_sft_openpi.yaml``, set the data format.

   .. code:: yaml

      model:
        openpi:
          config_name: "pi0_custom"

2. In ``rlinf/models/embodiment/openpi/__init__.py``, set the data format to ``pi0_custom``.

   .. code:: python

      TrainConfig(
          name="pi0_custom",
          model=pi0_config.Pi0Config(),
          data=CustomDataConfig(
              repo_id="physical-intelligence/custom_dataset",
              base_config=DataConfig(
                  prompt_from_task=True
              ),  # we need the language instruction
              assets=AssetsConfig(assets_dir="checkpoints/torch/pi0_base/assets"),
              extra_delta_transform=True,  # True for delta actions, False for absolute actions
              action_train_with_rotation_6d=False,  # users can add extra options in a custom dataset
          ),
          pytorch_weight_path="checkpoints/torch/pi0_base",
      ),

3. In ``rlinf/models/embodiment/openpi/dataconfig/custom_dataconfig.py``, define the custom dataset config.

   .. code:: python

      class CustomDataConfig(DataConfig):
          def __init__(self, *args, **kwargs):
              super().__init__(*args, **kwargs)
              self.repo_id = "physical-intelligence/custom_dataset"
              self.base_config = DataConfig(
                  prompt_from_task=True
              )
              self.assets = AssetsConfig(assets_dir="checkpoints/torch/pi0_base/assets")
              self.extra_delta_transform = True
              self.action_train_with_rotation_6d = False
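The ``extra_delta_transform`` flag above selects between delta actions and absolute actions. A minimal illustration of the distinction, using the generic robotics convention (illustrative only, not RLinf's actual transform):

```python
# Delta vs. absolute actions (generic convention, not RLinf's transform):
# an absolute action stores the target state directly; a delta action
# stores the change relative to the previously commanded state.
def to_delta(actions: list[float], state: float) -> list[float]:
    """Convert a chunk of absolute targets into deltas from the current state."""
    deltas = []
    prev = state
    for a in actions:
        deltas.append(a - prev)  # change relative to the previous target
        prev = a
    return deltas

# to_delta([1.0, 1.5, 1.5], state=0.5) -> [0.5, 0.5, 0.0]
```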
Training configuration
----------------------

A full example lives in ``examples/sft/config/libero_sft_openpi.yaml``. Key fields:

.. code:: yaml

   cluster:
     num_nodes: 1           # number of nodes
     component_placement:   # component → GPU mapping
       actor: 0-3
To enable LoRA fine-tuning, set ``actor.model.is_lora`` to ``True`` and configure ``actor.model.lora_rank``.

.. code:: yaml

   actor:
     model:
       is_lora: True
       lora_rank: 32
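For intuition on what ``lora_rank`` controls: a rank-``r`` LoRA adapter on a ``d_out × d_in`` linear layer trains ``r * (d_in + d_out)`` parameters instead of the full ``d_out * d_in``. A back-of-the-envelope helper (generic LoRA arithmetic, not RLinf code):

```python
# Generic LoRA parameter count (not RLinf-specific): a rank-r adapter
# factorizes the weight update as B @ A, with A of shape (r, d_in) and
# B of shape (d_out, r), adding r * (d_in + d_out) trainable parameters.
def lora_params(d_in: int, d_out: int, rank: int) -> int:
    return rank * (d_in + d_out)

# e.g. a hypothetical 4096x4096 projection with lora_rank 32:
# lora_params(4096, 4096, 32) -> 262144 trainable parameters,
# versus 4096 * 4096 = 16777216 for full fine-tuning of that layer.
```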
Launch scripts
--------------

First start the Ray cluster, then run the helper script:

.. code:: bash

   cd /path_to_RLinf/ray_utils
   bash start_ray.sh  # start head + workers

   # return to the repo root
   python examples/sft/train_embodied_sft.py --config libero_sft_openpi.yaml

The same script works for generic text SFT; just swap the config file.

docs/source-zh/rst_source/examples/index.rst

Lines changed: 1 addition & 0 deletions

@@ -252,3 +252,4 @@ RLinf's overall design is simple and modular, with Worker as the abstraction encapsulating RL training
   gr00t
   reasoning
   coding_online_rl
+  sft
Lines changed: 114 additions & 0 deletions

@@ -0,0 +1,114 @@

Supervised Fine-Tuning
======================

.. |huggingface| image:: /_static/svg/hf-logo.svg
   :width: 16px
   :height: 16px
   :class: inline-icon

This page explains how to run **full-parameter SFT** and **LoRA fine-tuning** in the RLinf framework. SFT usually serves as the first stage before reinforcement learning: the model first imitates high-quality demonstrations so that the subsequent RL stage can continue optimizing from a good prior.

Contents
--------

- How to configure full-parameter SFT and LoRA fine-tuning in RLinf
- How to launch training on a single machine or a multi-node cluster
- How to monitor and evaluate results

Supported datasets
------------------

RLinf currently supports datasets in the LeRobot format; the dataset type is selected via **config_type**.

Currently supported data formats:

- pi0_maniskill
- pi0_libero
- pi05_libero
- pi05_maniskill
- pi05_metaworld
- pi05_calvin

You can also train on a specific dataset with a custom data format; refer to the files below.

1. In ``examples/sft/config/custom_sft_openpi.yaml``, specify the data format.

   .. code:: yaml

      model:
        openpi:
          config_name: "pi0_custom"

2. In ``rlinf/models/embodiment/openpi/__init__.py``, set the data format to ``pi0_custom``.

   .. code:: python

      TrainConfig(
          name="pi0_custom",
          model=pi0_config.Pi0Config(),
          data=CustomDataConfig(
              repo_id="physical-intelligence/custom_dataset",
              base_config=DataConfig(
                  prompt_from_task=True
              ),  # we need the language instruction
              assets=AssetsConfig(assets_dir="checkpoints/torch/pi0_base/assets"),
              extra_delta_transform=True,  # True for delta actions, False for absolute actions
              action_train_with_rotation_6d=False,  # users can add extra options in a custom dataset
          ),
          pytorch_weight_path="checkpoints/torch/pi0_base",
      ),

3. In ``rlinf/models/embodiment/openpi/dataconfig/custom_dataconfig.py``, define the custom dataset config.

   .. code:: python

      class CustomDataConfig(DataConfig):
          def __init__(self, *args, **kwargs):
              super().__init__(*args, **kwargs)
              self.repo_id = "physical-intelligence/custom_dataset"
              self.base_config = DataConfig(
                  prompt_from_task=True
              )
              self.assets = AssetsConfig(assets_dir="checkpoints/torch/pi0_base/assets")
              self.extra_delta_transform = True
              self.action_train_with_rotation_6d = False

Training configuration
----------------------

A full example config lives in ``examples/sft/config/libero_sft_openpi.yaml``; the key fields are:

.. code:: yaml

   cluster:
     num_nodes: 1           # number of nodes
     component_placement:   # component → GPU mapping
       actor: 0-3

To enable LoRA fine-tuning, set ``actor.model.is_lora`` to ``True`` and configure ``actor.model.lora_rank``.

.. code:: yaml

   actor:
     model:
       is_lora: True
       lora_rank: 32

Launch scripts
--------------

First start the Ray cluster, then run the helper script:

.. code:: bash

   cd /path_to_RLinf/ray_utils
   bash start_ray.sh  # start head + workers

   # return to the repo root
   python examples/sft/train_embodied_sft.py --config libero_sft_openpi.yaml

The same script also works for generic text SFT; just swap the config file.
Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@
defaults:
  - model/pi0@actor.model
  - training_backend/fsdp@actor.fsdp_config
  - override hydra/job_logging: stdout

hydra:
  run:
    dir: .
  output_subdir: null
  searchpath:
    - file://${oc.env:EMBODIED_PATH}/config/

cluster:
  num_nodes: 1
  component_placement:
    actor,env,rollout: 0-0

runner:
  task_type: sft
  logger:
    log_path: "../results"
    project_name: rlinf
    experiment_name: "test_openpi"
    logger_backends: ["tensorboard"]  # wandb, swanlab

  max_epochs: 1000
  max_steps: -1
  val_check_interval: -1
  save_interval: 10

data:
  data_path: "/path/to/custom-data"

algorithm:
  adv_type: gae

actor:
  group_name: "ActorGroup"
  training_backend: "fsdp"
  micro_batch_size: 1
  global_batch_size: 16
  seed: 0

  # Override the default values in model/pi0
  model:
    precision: null
    model_path: "/path/to/pi0-model"
    num_action_chunks: 4  # interface for the env
    add_value_head: True
    openpi:
      config_name: "pi0_custom"
    detach_critic_input: True

  optim:
    lr: 7.91e-6
    value_lr: 1.55e-4
    adam_beta1: 0.9
    adam_beta2: 0.95
    adam_eps: 1.0e-05
    clip_grad: 1.0

  # Override the default values in training_backend/fsdp
  fsdp_config:
    strategy: "fsdp"
    sharding_strategy: "no_shard"
    use_orig_params: True
    gradient_checkpointing: False  # gradient checkpointing is not supported for openpi; do not change this value
    mixed_precision:
      param_dtype: ${actor.model.precision}
      reduce_dtype: ${actor.model.precision}
      buffer_dtype: ${actor.model.precision}

reward:
  use_reward_model: False

critic:
  use_critic_model: False
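The batch-size fields in the config above follow the usual data-parallel relationship: with ``micro_batch_size: 1`` and ``global_batch_size: 16``, each optimizer step accumulates ``16 / (1 × world_size)`` micro-batches per rank. A sketch of that arithmetic (generic, not RLinf internals):

```python
# Generic data-parallel batch arithmetic (not RLinf internals): the number
# of gradient-accumulation micro-steps per optimizer step given the global
# batch size, per-GPU micro batch size, and number of data-parallel ranks.
def grad_accum_steps(global_bs: int, micro_bs: int, world_size: int) -> int:
    per_step = micro_bs * world_size  # samples processed per micro-step
    if global_bs % per_step:
        raise ValueError(
            "global_batch_size must be divisible by micro_batch_size * world_size"
        )
    return global_bs // per_step

# With the config above on a single GPU (placement 0-0):
# grad_accum_steps(16, 1, 1) -> 16 micro-batches accumulated per step.
```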
