Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ repos:
- id: check-added-large-files

- repo: https://github.com/psf/black
rev: 21.8b0
rev: 22.3.0
hooks:
- id: black

Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ For single-node single-gpu training, run:
```bash
conda activate mistral
cd mistral
CUDA_VISIBLE_DEVICES=0 python train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id tutorial-gpt2-micro
CUDA_VISIBLE_DEVICES=0 python train.py --file conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id tutorial-gpt2-micro
```

#### Multi-node multi-GPU training with DeepSpeed
Expand All @@ -86,7 +86,7 @@ To start distributed training, run:
```bash
conda activate mistral
cd mistral
deepspeed --num_gpus 8 --num_nodes 2 --master_addr machine1 train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id tutorial-gpt2-micro-multi-node
deepspeed --num_gpus 8 --num_nodes 2 --master_addr machine1 train.py --file conf/tutorial-gpt2-micro.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id tutorial-gpt2-micro-multi-node
```

Note: You may need to adjust your batch size depending on the capacity of your GPUs.
Expand Down
2 changes: 1 addition & 1 deletion conf/archive/old-benchmarking/gpt2-benchmark-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/openwebtext.yaml
- models/gpt2-small.yaml
- trainers/benchmark.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/archive/old-benchmarking/gpt2-intensive-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/openwebtext.yaml
- models/gpt2-small.yaml
- trainers/intensive.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/archive/old-benchmarking/gpt2-toy-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/wikitext103.yaml
- models/gpt2-small.yaml
- trainers/toy.yaml
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- ../../datasets/wikitext103.yaml
- ../../models/gpt2-medium.yaml
- ../../trainers/gpt2-medium.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/datasets/wikitext103.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
dataset:
id: wikitext
name: wikitext-103-raw-v1
validation_ratio: null
validation_ratio: 0.005

# Number of Preprocessing Workers
num_proc: 4
Expand Down
4 changes: 2 additions & 2 deletions conf/gpt2-debug-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
- datasets/openwebtext.yaml
inherits:
- datasets/wikitext103.yaml
- models/gpt2-small.yaml
- trainers/debug.yaml

Expand Down
2 changes: 1 addition & 1 deletion conf/gpt2-mistral-medium-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/openwebtext.yaml
- models/gpt2-medium.yaml
- trainers/gpt2-medium.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/gpt2-mistral-medium-gcp-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/openwebtext.yaml
- models/gpt2-medium.yaml
- trainers/gpt2-medium.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/gpt2-mistral-mini-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/openwebtext.yaml
- models/gpt2-mini.yaml
- trainers/gpt2-small.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/gpt2-mistral-small-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/openwebtext.yaml
- models/gpt2-small.yaml
- trainers/gpt2-small.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/gpt2-mistral-small-gcp-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/openwebtext.yaml
- models/gpt2-small.yaml
- trainers/gpt2-small.yaml
Expand Down
2 changes: 1 addition & 1 deletion conf/gpt2-scaling-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Inheritance and core paths can all be overridden from the command line or by re-writing these files.
---
# Inherit Dataset, Tokenization, Model, and Training Details
inherit:
inherits:
- datasets/wikitext103.yaml
- models/gpt2-small.yaml
- trainers/gpt2-small.yaml
Expand Down
137 changes: 0 additions & 137 deletions conf/train_schema.py

This file was deleted.

12 changes: 0 additions & 12 deletions conf/trainers/benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
---
training_arguments:
# Overwrite from Top-Level Config
output_dir: null

# Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
do_train: true
Expand All @@ -16,9 +15,6 @@ training_arguments:
per_device_train_batch_size: 2
per_device_eval_batch_size: 2

# We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)]
gradient_accumulation_steps: null

# For Online Evaluation, only keep around the Losses
prediction_loss_only: true

Expand Down Expand Up @@ -52,16 +48,8 @@ training_arguments:
# Resume Behavior --> ignore "full determinism" on resume (saves time for debugging)
ignore_data_skip: false

# Seeds -- Should be Overwritten at Runtime!
seed: null

### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
fp16: false
sharded_ddp: null
deepspeed: null

# Dataloader Parallelism
dataloader_num_workers: 4

# Should be overwritten from the Top-Level Config or CLI!
local_rank: null
12 changes: 0 additions & 12 deletions conf/trainers/debug.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
---
training_arguments:
# Overwrite from Top-Level Config
output_dir: null

# Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
do_train: true
Expand All @@ -16,9 +15,6 @@ training_arguments:
per_device_train_batch_size: 16
per_device_eval_batch_size: 16

# We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)]
gradient_accumulation_steps: null

# For Online Evaluation, only keep around the Losses
prediction_loss_only: true

Expand Down Expand Up @@ -52,16 +48,8 @@ training_arguments:
# Resume Behavior --> ignore "full determinism" on resume (saves time for debugging)
ignore_data_skip: false

# Seeds -- Should be Overwritten at Runtime!
seed: null

### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
fp16: true
sharded_ddp: null
deepspeed: null

# Dataloader Parallelism
dataloader_num_workers: 4

# Should be overwritten from the Top-Level Config or CLI!
local_rank: null
12 changes: 0 additions & 12 deletions conf/trainers/gpt2-medium.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
---
training_arguments:
# Overwrite from Top-Level Config
output_dir: null

# Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
do_train: true
Expand All @@ -16,9 +15,6 @@ training_arguments:
per_device_train_batch_size: 4
per_device_eval_batch_size: 16

# We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)]
gradient_accumulation_steps: null

# Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime
gradient_checkpointing: false

Expand Down Expand Up @@ -56,16 +52,8 @@ training_arguments:
# Resume Behavior --> ignore "full determinism" on resume (saves time for debugging)
ignore_data_skip: false

# Seeds -- Should be Overwritten at Runtime!
seed: null

### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
fp16: true
sharded_ddp: null
deepspeed: null

# Dataloader Parallelism
dataloader_num_workers: 4

# Should be overwritten from the Top-Level Config or CLI!
local_rank: null
Loading