
Commit fe19cf5

Vincent Moens, apbard, and tcbegley authored

[Algorithm] RLHF end-to-end, clean (#1597)

Co-authored-by: Alessandro Pietro Bardelli <[email protected]>
Co-authored-by: Tom Begley <[email protected]>
1 parent f09b0c8 commit fe19cf5

26 files changed: +1402 −38 lines changed

.github/unittest/linux_examples/scripts/run_test.sh

Lines changed: 3 additions & 1 deletion
@@ -282,8 +282,10 @@ python .github/unittest/helpers/coverage_run_parallel.py examples/multiagent/sac
   train.minibatch_size=100 \
   logger.backend=

-
 python .github/unittest/helpers/coverage_run_parallel.py examples/bandits/dqn.py --n_steps=100

+## RLHF
+# RLHF tests are executed in the dedicated workflow
+
 coverage combine
 coverage xml -i

.github/unittest/linux_libs/scripts_rlhf/run_test.sh

Lines changed: 9 additions & 0 deletions
@@ -22,5 +22,14 @@ conda deactivate && conda activate ./env
 python -c "import transformers, datasets"

 python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_rlhf.py --instafail -v --durations 200 --capture no --error-for-skips
+
+python .github/unittest/helpers/coverage_run_parallel.py examples/rlhf/train_rlhf.py \
+  sys.device=cuda:0 sys.ref_device=cuda:0 \
+  model.name_or_path=gpt2 train.max_epochs=2 \
+  data.batch_size=2 train.ppo.ppo_batch_size=2 \
+  train.ppo.ppo_num_epochs=1 reward_model.name_or_path= \
+  train.ppo.episode_length=8 train.ppo.num_rollouts_per_epoch=4 \
+  data.block_size=110 io.logger=csv
+
 coverage combine
 coverage xml -i

examples/rlhf/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
*.png
*.bin
*.pt
*.json

examples/rlhf/README.md

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# RLHF example

This example uses RLHF (Reinforcement Learning from Human Feedback) to train a
language model to summarize Reddit posts.

## Getting started

Make sure you have PyTorch>=2.0 installed. You can find installation instructions
[here](https://pytorch.org/get-started/locally/).

From this directory, you can install extra requirements for running these
examples with

```sh
pip install -r requirements.txt
```

## Training the models

### Training the transformer

Once the data has been prepared, you can train the GPT model.

```sh
python train.py
```

The default configuration can be found in `config/train.yaml`, and any option can
be overridden with command-line arguments, for example to run the training
script with a different batch size:
```sh
python train.py data.batch_size=128
```

> **_NOTE:_** Users of Apple Silicon MacBooks should make sure to use `sys.device=mps`
> and prepend all commands with `PYTORCH_ENABLE_MPS_FALLBACK=1` to enable CPU fallback.

### Training the reward model

Once you have completed supervised fine-tuning, copy the desired model
checkpoint to `./out`, or update the config to point `model.name_or_path` at
the relevant checkpoint in the timestamped working directory created by Hydra.
You can then train the reward model with:

```sh
python train_reward.py
```

### Training the final model with RLHF

Once again, make sure you have either updated the configuration to point
`reward_model.name_or_path` at the relevant timestamped working directory, or
copied the checkpoint to `./out_reward`.
You can then train the final model by running

```sh
python train_rlhf.py
```

examples/rlhf/config/train.yaml

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
io:
  eval_interval: 200
  log_interval: 50
  eval_iters: 100
data:
  batch_size: 16  # if gradient_accumulation_steps > 1, this is the micro-batch size
  block_size: 550
model:
  name_or_path: gpt2  # gpt2 for pre-trained, local path for checkpoint
  out_dir: ./out
  dropout: 0.1  # for pretraining 0 is good, for finetuning try 0.1+
train:
  grad_clip: 1.0  # clip gradients at this value, or disable if == 0.0
  max_iters: 5000  # total number of training iterations
  gradient_accumulation_steps: 2  # used to simulate larger batch sizes
  always_save_checkpoint: False  # if True, always save a checkpoint after each evaluation in out_dir
  decay_lr: True  # whether to decay the learning rate
  optimizer:
    # keyword arguments for torch.optim.AdamW
    lr: 1.0e-5
    weight_decay: 1.0e-1
    betas: [0.9, 0.95]
  scheduler:
    # keyword arguments for torch.optim.lr_scheduler.CosineAnnealingLR
    T_max: 5000  # maximum number of iterations
    eta_min: 1.0e-6  # minimum learning rate
sys:
  device: cuda  # examples: cpu, cuda, cuda:0, cuda:1 etc., or try mps on macbooks
  dtype: bfloat16  # float32, bfloat16, or float16; the latter will automatically use a GradScaler
  compile: True  # use PyTorch 2.0 to compile the model to be faster
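For orientation, the `optimizer` and `scheduler` sections are documented as keyword arguments for `torch.optim.AdamW` and `torch.optim.lr_scheduler.CosineAnnealingLR`. The snippet below is a minimal sketch of how such a config could be wired up, assuming the optimizer/scheduler blocks sit under `train` as rendered above and that the config is loaded with OmegaConf; it is not the example's actual training code, and the stand-in model is hypothetical.

```python
# Hypothetical sketch (not train.py): build the optimizer and scheduler from the
# train.optimizer / train.scheduler sections of config/train.yaml.
import torch
from omegaconf import OmegaConf

cfg = OmegaConf.load("config/train.yaml")  # assumes we run from examples/rlhf
model = torch.nn.Linear(8, 8)              # stand-in for the GPT model

optimizer = torch.optim.AdamW(
    model.parameters(), **OmegaConf.to_container(cfg.train.optimizer)
)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, **OmegaConf.to_container(cfg.train.scheduler)
)
```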
examples/rlhf/config/train_reward.yaml

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
io:
  eval_interval: 200
  log_interval: 50
  eval_iters: 100
data:
  batch_size: 16  # if gradient_accumulation_steps > 1, this is the micro-batch size
  block_size: 550
model:
  name_or_path: ./out
  dropout: 0.1  # for pretraining 0 is good, for finetuning try 0.1+
reward_model:
  out_dir: ./out_reward
  init_from: scratch  # 'scratch' or 'resume' - if "resume", the model will be loaded from out_dir_reward
train:
  grad_clip: 1.0  # clip gradients at this value, or disable if == 0.0
  max_iters: 20000  # total number of training iterations
  gradient_accumulation_steps: 2  # used to simulate larger batch sizes
  always_save_checkpoint: False  # if True, always save a checkpoint after each eval
  decay_lr: False  # whether to decay the learning rate
  optimizer:
    # keyword arguments for torch.optim.AdamW
    lr: 1.0e-5
    weight_decay: 1.0e-1
    betas: [0.9, 0.95]
  scheduler:
    # keyword arguments for torch.optim.lr_scheduler.CosineAnnealingLR
    T_max: 20000
    eta_min: 1.0e-6
sys:
  device: cuda  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
  dtype: bfloat16  # 'float32', 'bfloat16', or 'float16'; the latter will automatically use a GradScaler
  compile: True  # use PyTorch 2.0 to compile the model to be faster
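The `gradient_accumulation_steps` comment says it is used to simulate larger batch sizes. The snippet below is a generic illustration of that pattern (a sketch under the assumption of a plain supervised loop, not code taken from the example's training scripts): with `batch_size: 16` and `gradient_accumulation_steps: 2`, the effective batch size is 32.

```python
# Generic gradient-accumulation sketch for the comment above; not from train_reward.py.
import torch

grad_accum_steps = 2
model = torch.nn.Linear(16, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0e-5)

optimizer.zero_grad(set_to_none=True)
for step in range(100):
    x, y = torch.randn(16, 16), torch.randn(16, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    # Scale the loss so the accumulated gradient averages over micro-batches.
    (loss / grad_accum_steps).backward()
    if (step + 1) % grad_accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cf. train.grad_clip
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
```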
examples/rlhf/config/train_rlhf.yaml

Lines changed: 39 additions & 0 deletions

@@ -0,0 +1,39 @@
io:
  eval_interval: 6
  log_interval: 1
  eval_iters: 10
  logger: wandb
data:
  batch_size: 4  # if gradient_accumulation_steps > 1, this is the micro-batch size
  block_size: 550
  num_workers: 1
model:
  name_or_path: ./out
  out_dir: ./out_rlhf
  dropout: 0.1  # for pretraining 0 is good, for finetuning try 0.1+
reward_model:
  name_or_path: ./out_reward
train:
  grad_clip: 1.0
  max_epochs: 1000  # total number of training iterations
  always_save_checkpoint: True  # if True, always save a checkpoint after each eval
  decay_lr: True
  optimizer:
    # keyword arguments for torch.optim.AdamW
    lr: 5.0e-5
    weight_decay: 0.0  # 01
    betas: [0.9, 0.999]
  scheduler:
    # keyword arguments for torch.optim.lr_scheduler.CosineAnnealingLR
    T_max: 3000  # max_epochs * num_rollouts / ppo_batch_size
    eta_min: 5.0e-6
  ppo:
    episode_length: 50
    ppo_batch_size: 16
    ppo_num_epochs: 3
    num_rollouts_per_epoch: 32
sys:
  device: cuda  # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
  ref_device: cuda:1  # device of reference model
  dtype: bfloat16  # 'float32', 'bfloat16', or 'float16'; the latter will automatically use a GradScaler
  compile: False  # use PyTorch 2.0 to compile the model to be faster
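The `sys.dtype` comment notes that `float16` implies a GradScaler. The snippet below is a generic sketch of the standard PyTorch mixed-precision pattern that comment refers to, assuming a CUDA device as in `sys.device`; it is not code from `train_rlhf.py`, and the toy model and data are placeholders.

```python
# Generic mixed-precision sketch illustrating the sys.dtype comment above; not from train_rlhf.py.
# float16 needs loss scaling (GradScaler enabled); bfloat16 and float32 do not.
import torch

dtype = torch.bfloat16  # from sys.dtype: torch.float32, torch.bfloat16, or torch.float16
scaler = torch.cuda.amp.GradScaler(enabled=dtype == torch.float16)

model = torch.nn.Linear(16, 1).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=5.0e-5)
x, y = torch.randn(4, 16, device="cuda"), torch.randn(4, 1, device="cuda")

with torch.autocast(device_type="cuda", dtype=dtype, enabled=dtype != torch.float32):
    loss = torch.nn.functional.mse_loss(model(x), y)

scaler.scale(loss).backward()   # no-op scaling when the scaler is disabled
scaler.unscale_(optimizer)      # unscale before clipping (no-op when disabled)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cf. train.grad_clip
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
```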

examples/rlhf/data/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
from torchrl.data.rlhf.prompt import get_prompt_dataloader_tldr

__all__ = ["get_prompt_dataloader_tldr"]

examples/rlhf/models/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
examples/rlhf/models/actor_critic.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from torchrl.modules.tensordict_module.actors import LMHeadActorValueOperator
from torchrl.modules.tensordict_module.common import VmapModule

from .transformer import init_transformer

__all__ = ["init_actor_critic"]


def init_actor_critic(model_cfg, sys_cfg):

    transformer_name_or_path = model_cfg.name_or_path
    dropout = model_cfg.dropout

    device = sys_cfg.device
    compile_model = sys_cfg.compile
    base_model = init_transformer(
        transformer_name_or_path,
        dropout,
        device,
        as_tensordictmodule=False,
        compile_model=compile_model,
        inference=True,
    )
    model = LMHeadActorValueOperator(base_model)
    model.to(device)
    model.eval()
    actor = model.get_policy_operator()
    critic = model.get_value_operator()
    critic_head = model.get_value_head()

    return actor, VmapModule(critic), critic_head, base_model
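A brief usage sketch of this helper follows. The real call site lives in `train_rlhf.py`, which is not shown in this diff; the module path and the use of OmegaConf to load the config are assumptions for illustration.

```python
# Hypothetical usage sketch for init_actor_critic; not the example's actual entry point.
from omegaconf import OmegaConf

from models.actor_critic import init_actor_critic  # assumed module path

cfg = OmegaConf.load("config/train_rlhf.yaml")
actor, critic, critic_head, base_model = init_actor_critic(cfg.model, cfg.sys)

# `actor` is the policy operator over the language-model head, `critic` is the
# value operator wrapped in VmapModule, and both share the transformer trunk
# held in `base_model`.
```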
