
Commit 375ad7d

Refactor Storage (agentscope-ai#227)
1 parent: c5e6dd3 · commit: 375ad7d


77 files changed: +1029 additions, -977 deletions


benchmark/config/countdown-template.yaml

Lines changed: 0 additions & 2 deletions
@@ -46,8 +46,6 @@ buffer:
 priority_fn: linear_decay
 decay: 0.1
 sft_warmup_steps: 0
-max_retry_times: 3
-max_retry_interval: 1
 explorer:
 runner_num: 32
 max_timeout: 900

benchmark/config/gsm8k-template.yaml

Lines changed: 0 additions & 2 deletions
@@ -51,8 +51,6 @@ buffer:
 priority_fn: linear_decay
 decay: 0.1
 sft_warmup_steps: 0
-max_retry_times: 3
-max_retry_interval: 1
 explorer:
 runner_per_model: 8
 max_timeout: 900

docs/sphinx_doc/source/_templates/versions.html

Lines changed: 2 additions & 2 deletions
@@ -2,7 +2,7 @@
 <div class="rst-versions" data-toggle="rst-versions" role="note" aria-label="versions">
 <span class="rst-current-version" data-toggle="rst-current-version">
 <span class="fa fa-book"> Other Versions</span>
-v: {{ current_version.name }}
+<b>{{ current_version.name }}</b>
 <span class="fa fa-caret-down"></span>
 </span>
 <div class="rst-other-versions">
@@ -18,7 +18,7 @@
 <dl>
 <dt>Branches</dt>
 {%- for item in versions.branches %}
-<dd><a href="{{ item.url }}">{{ item.name }}</a> <b>(latest)</b></dd>
+<dd><b><a href="{{ item.url }}">{{ item.name }}</a> (latest)</b></dd>
 {%- endfor %}
 </dl>
 {%- endif %}

docs/sphinx_doc/source/tutorial/example_mix_algo.md

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ class MIXAlgorithm(AlgorithmType):
 use_reference: bool = True
 compute_advantage_in_trainer: bool = False
 can_balance_batch: bool = True
-schema: type = ExperienceModel
+schema: str = "experience"

 @classmethod
 def default_config(cls) -> Dict:

docs/sphinx_doc/source/tutorial/example_step_wise.md

Lines changed: 0 additions & 2 deletions
@@ -107,8 +107,6 @@ buffer:
 total_epochs: 20
 batch_size: 16
 train_batch_size: 7680 # here: batch_size * repeat_times * max_env_steps
-max_retry_times: 3
-max_retry_interval: 1
 explorer_input:
 taskset:
 name: alfworld

docs/sphinx_doc/source/tutorial/faq.md

Lines changed: 1 addition & 2 deletions
@@ -120,7 +120,7 @@ from sqlalchemy import create_engine
 from sqlalchemy.exc import OperationalError
 from sqlalchemy.orm import sessionmaker
 from sqlalchemy.pool import NullPool
-from trinity.common.schema import ExperienceModel
+from trinity.common.schema.sql_schema import ExperienceModel

 engine = create_engine(buffer.trainer_input.experience_buffer.path)
 session = sessionmaker(bind=engine)
@@ -129,7 +129,6 @@ sess = session()
 MAX_EXPERIENCES = 4
 experiences = (
 sess.query(ExperienceModel)
-.with_for_update()
 .limit(MAX_EXPERIENCES)
 .all()
 )
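
The updated FAQ snippet above boils down to the following standalone sketch: it imports `ExperienceModel` from the new `trinity.common.schema.sql_schema` module and reads a few experiences without the dropped `.with_for_update()` row lock. The SQLite path below is a placeholder assumption; in practice it comes from `buffer.trainer_input.experience_buffer.path`.

```python
# Minimal sketch based on the updated faq.md snippet in this commit.
# The SQLite path is a placeholder; substitute the configured experience buffer path.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from trinity.common.schema.sql_schema import ExperienceModel  # new import path

engine = create_engine("sqlite:///countdown_buffer.db")
session = sessionmaker(bind=engine)
sess = session()

MAX_EXPERIENCES = 4
# Read a handful of experiences; the row lock (.with_for_update()) was removed in this commit.
experiences = (
    sess.query(ExperienceModel)
    .limit(MAX_EXPERIENCES)
    .all()
)
```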

docs/sphinx_doc/source/tutorial/trinity_configs.md

Lines changed: 35 additions & 48 deletions
@@ -35,7 +35,7 @@ synchronizer:
 # Model weight synchronization settings
 ...
 monitor:
-# Monitoring configurations (e.g., WandB or TensorBoard)
+# Monitoring configurations (e.g., WandB, TensorBoard or MLFlow)
 ...
 service:
 # Services to use
@@ -48,10 +48,12 @@ log:
 ...
 ```

-Each of these sections will be explained in detail below.
+Each of these sections will be explained in detail below. For additional details about specific parameters not covered here, please refer to the [source code](https://github.com/modelscope/Trinity-RFT/blob/main/trinity/common/config.py).

-```{note}
-For additional details about specific parameters not covered here, please refer to the [source code](https://github.com/modelscope/Trinity-RFT/blob/main/trinity/common/config.py).
+```{tip}
+Trinity-RFT uses [OmegaConf](https://omegaconf.readthedocs.io/en/latest/) to load YAML configuration files.
+It supports some advanced features like [variable interpolation](https://omegaconf.readthedocs.io/en/latest/usage.html#variable-interpolation) and [environment variable substitution](https://omegaconf.readthedocs.io/en/latest/custom_resolvers.html#oc-env).
+Users can use these features to simplify configuration.
 ```

 ---
@@ -64,7 +66,7 @@ These are general settings that apply to the entire experiment.
 project: Trinity-RFT
 name: example
 mode: both
-checkpoint_root_dir: /PATH/TO/CHECKPOINT
+checkpoint_root_dir: ${oc.env:CHECKPOINT_ROOT_DIR} # CHECKPOINT_ROOT_DIR is an environment variable set in advance
 ```

 - `project`: The name of the project.
@@ -115,13 +117,25 @@ Used to log training metrics during execution.
 ```yaml
 monitor:
 monitor_type: wandb
+monitor_args:
+base_url: http://localhost:8080
+api_key: your_api_key
 enable_ray_timeline: False
 ```

 - `monitor_type`: Type of monitoring system. Options:
 - `wandb`: Logs to [Weights & Biases](https://docs.wandb.ai/quickstart/). Requires logging in and setting `WANDB_API_KEY`. Project and run names match the `project` and `name` fields in global configs.
 - `tensorboard`: Logs to [TensorBoard](https://www.tensorflow.org/tensorboard). Files are saved under `<checkpoint_root_dir>/<project>/<name>/monitor/tensorboard`.
-- `enable_ray_timeline`: Whether to export the ray timeline. If set to `True`, a `timeline.json` file will be exported to `<checkpoint_root_dir>/<project>/<name>/monitor`. You can view the timeline file in Chrome at [chrome://tracing](chrome://tracing).
+- `mlflow`: Logs to [MLFlow](https://mlflow.org/). If [MLFlow authentication](https://mlflow.org/docs/latest/ml/auth/) is setup, set `MLFLOW_TRACKING_USERNAME` and `MLFLOW_TRACKING_PASSWORD` as environment variables before running.
+- `monitor_args`: Dictionary of arguments for monitor initialization.
+- For `wandb`:
+- `base_url`: Overrides `WANDB_BASE_URL` if set.
+- `api_key`: Overrides `WANDB_API_KEY` if set.
+- For `mlflow`:
+- `uri`: The URI of your MLFlow instance. Strongly recommended to set; defaults to `http://localhost:5000`.
+- `username`: Overrides `MLFLOW_TRACKING_USERNAME` if set.
+- `password`: Overrides `MLFLOW_TRACKING_PASSWORD` if set.
+- `enable_ray_timeline`: If `True`, exports a `timeline.json` file to `<checkpoint_root_dir>/<project>/<name>/monitor`. Viewable in Chrome at [chrome://tracing](chrome://tracing).

 ---

@@ -131,8 +145,8 @@ Defines the model paths and token limits.

 ```yaml
 model:
-model_path: /PATH/TO/MODEL/
-critic_model_path: ''
+model_path: ${oc.env:MODEL_PATH} # MODEL_PATH is an environment variable set in advance
+critic_model_path: ${model.model_path} # use the value of model.model_path
 max_response_tokens: 16384
 max_model_len: 20480
 ```
@@ -174,10 +188,6 @@ buffer:
 ...
 eval_tasksets:
 ...
-
-explorer_output:
-...
-
 trainer_input:
 experience_buffer:
 ...
@@ -255,41 +265,6 @@ The configuration for each task dataset is defined as follows:
 - `default_reward_fn_type`: Reward function used during exploration. If not specified, the `buffer.default_reward_fn_type` is used.
 - `workflow_args`: A dictionary of arguments used to supplement dataset-level parameters.

-
-### Explorer Output
-
-In [`explore` mode](#global-configuration), since there is no trainer, users can configure an experience buffer via `buffer.explorer_output`, rather than using `buffer.trainer_input`, which will be introduced in the next section.
-
-```{note}
-For `both` and `train` modes, users should use `buffer.trainer_input.experience_buffer` instead of `buffer.explorer_output`.
-```
-
-```yaml
-buffer:
-...
-explorer_output:
-name: countdown_buffer
-storage_type: queue
-path: sqlite:///countdown_buffer.db
-wrap_in_ray: True
-max_read_timeout: 1800
-```
-
-- `name`: The name of the experience buffer. This name will be used as the Ray actor's name, so it must be unique.
-- `storage_type`: The storage type for the experience buffer.
-- `queue`: Experience data is stored in a queue. This storage type is recommended for most use cases.
-- `sql`: Experience data is stored in a SQL database. If your database only supports local access (e.g., SQLite), set `wrap_in_ray` to `True` to wrap the database in a Ray actor, enabling remote access from other nodes.
-- `file`: Experience data is stored in a JSON file. This storage type should be used only for debugging purposes in `explore` mode.
-- `path`: The path to the experience buffer.
-- For `queue` storage type, this field is optional. You can specify a SQLite database or JSON file path here to back up the queue data.
-- For `file` storage type, the path points to the directory containing the dataset files.
-- For `sql` storage type, the path points to the SQLite database file.
-- `wrap_in_ray`: Whether to wrap the experience buffer in a Ray actor. Only take effect when `storage_type` is `sql` or `file`. The `queue` storage always uses a Ray actor.
-- `max_read_timeout`: The maximum waiting time (in seconds) to read new experience data. If exceeded, an incomplete batch will be returned directly. Only take effect when `storage_type` is `queue`. Default is 1800 seconds (30 minutes).
-- `use_priority_queue`: Only take effect when `storage_type` is `queue`. If set to `True`, the queue will be a priority queue, which allows for prioritizing certain experiences over others. Default is `False`.
-- `reuse_cooldown_time`: Only take effect when `storage_type` is `queue` and `use_priority_queue` is `True`. If set, it specifies the cooldown time (in seconds) for reusing experiences. If not specified, the default value is `None`, meaning experiences can not be reused.
-
-
 ### Trainer Input

 Defines the experience buffer and optional SFT warm-up dataset.
@@ -314,7 +289,19 @@ buffer:
 sft_warmup_steps: 0
 ```

-- `experience_buffer`: Experience buffer used by the trainer, which is logically equivalent to `buffer.explorer_output`.
+- `experience_buffer`: It is the input of Trainer and also the output of Explorer. This field is required even in explore mode.
+- `name`: The name of the experience buffer. This name will be used as the Ray actor's name, so it must be unique.
+- `storage_type`: The storage type for the experience buffer.
+- `queue`: Experience data is stored in a queue. This storage type is recommended for most use cases.
+- `sql`: Experience data is stored in a SQL database.
+- `file`: Experience data is stored in a JSON file. This storage type should be used only for debugging purposes in `explore` mode.
+- `path`: The path to the experience buffer.
+- For `queue` storage type, this field is optional. You can specify a SQLite database or JSON file path here to back up the queue data.
+- For `file` storage type, the path points to the directory containing the dataset files.
+- For `sql` storage type, the path points to the SQLite database file.
+- `max_read_timeout`: The maximum waiting time (in seconds) to read new experience data. If exceeded, an incomplete batch will be returned directly. Only take effect when `storage_type` is `queue`. Default is 1800 seconds (30 minutes).
+- `use_priority_queue`: Only take effect when `storage_type` is `queue`. If set to `True`, the queue will be a priority queue, which allows for prioritizing certain experiences over others. Default is `False`.
+- `reuse_cooldown_time`: Only take effect when `storage_type` is `queue` and `use_priority_queue` is `True`. If set, it specifies the cooldown time (in seconds) for reusing experiences. If not specified, the default value is `None`, meaning experiences can not be reused.
 - `sft_warmup_dataset`: Optional dataset used for pre-training (SFT warmup).
 - `sft_warmup_steps`: Number of steps to use SFT warm-up before RL begins.
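
The updated tutorial leans on OmegaConf interpolation (`${oc.env:CHECKPOINT_ROOT_DIR}`, `${model.model_path}`). The sketch below is illustrative only and not part of this commit; it shows how such placeholders resolve, with the environment variable values assumed for the example.

```python
# Illustrative sketch (not from this commit): resolving the interpolated values
# used in the updated trinity_configs.md examples with OmegaConf.
import os

from omegaconf import OmegaConf

# Assumed values purely for demonstration.
os.environ.setdefault("CHECKPOINT_ROOT_DIR", "/tmp/checkpoints")
os.environ.setdefault("MODEL_PATH", "/tmp/models/qwen")

yaml_text = """
checkpoint_root_dir: ${oc.env:CHECKPOINT_ROOT_DIR}
model:
  model_path: ${oc.env:MODEL_PATH}
  critic_model_path: ${model.model_path}  # reuses model.model_path
"""

cfg = OmegaConf.create(yaml_text)
print(cfg.checkpoint_root_dir)      # /tmp/checkpoints (from the environment variable)
print(cfg.model.critic_model_path)  # /tmp/models/qwen (via variable interpolation)
```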

docs/sphinx_doc/source/tutorial/trinity_programming_guide.md

Lines changed: 2 additions & 2 deletions
@@ -447,7 +447,7 @@

 The above steps implement the components needed for the algorithm, but these components are scattered and need to be configured in multiple places to take effect.

-To simplify configuration, Trinity-RFT provides {class}`trinity.algorithm.AlgorithmType` to describe a complete algorithm and registers it in {object}`trinity.algorithm.ALGORITHM_TYPE`, enabling one-click configuration.
+To simplify configuration, Trinity-RFT provides {class}`trinity.algorithm.AlgorithmType` to describe a complete algorithm and registers it in {class}`trinity.algorithm.ALGORITHM_TYPE`, enabling one-click configuration.

 The `AlgorithmType` class includes the following attributes and methods:

@@ -473,7 +473,7 @@ class OPMDAlgorithm(AlgorithmType):
 use_reference: bool = True
 compute_advantage_in_trainer: bool = False
 can_balance_batch: bool = True
-schema: type = ExperienceModel
+schema: str = "experience"

 @classmethod
 def default_config(cls) -> Dict:
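
With this refactor, the `schema` attribute of an algorithm definition becomes a registry key string ("experience") instead of the `ExperienceModel` class. Below is a minimal sketch of a custom algorithm definition after the change, based on the fragments above; the class name and the `default_config` body are assumptions, and registration in `ALGORITHM_TYPE` is only noted in a comment.

```python
# Sketch of an AlgorithmType subclass after this commit; the class name and
# default_config values are assumptions, only the attribute names and types
# come from the diffs above.
from typing import Dict

from trinity.algorithm import AlgorithmType  # documented location of AlgorithmType


class MyAlgorithm(AlgorithmType):
    use_reference: bool = True
    compute_advantage_in_trainer: bool = False
    can_balance_batch: bool = True
    schema: str = "experience"  # previously: schema: type = ExperienceModel

    @classmethod
    def default_config(cls) -> Dict:
        # Placeholder; real algorithms return their hyperparameter defaults here.
        return {}


# To take effect, the class still needs to be registered in
# trinity.algorithm.ALGORITHM_TYPE (see the programming guide).
```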

examples/RAFT_alfworld/RAFT_alfworld_7B.yaml

Lines changed: 0 additions & 2 deletions
@@ -15,8 +15,6 @@ cluster:
 buffer:
 total_epochs: 30
 batch_size: 80
-max_retry_times: 1
-max_retry_interval: 1
 explorer_input:
 taskset:
 name: alfworld-train

examples/RAFT_alfworld/RAFT_reflect_alfworld_7B.yaml

Lines changed: 0 additions & 2 deletions
@@ -15,8 +15,6 @@ cluster:
 buffer:
 total_epochs: 30
 batch_size: 80
-max_retry_times: 1
-max_retry_interval: 1
 explorer_input:
 taskset:
 name: alfworld-train
