stanford-crfm · dlwh · Apr 19, 2022 · Apr 19, 2022 · Apr 19, 2022 · Apr 19, 2022
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
     -   id: check-added-large-files
 
 -   repo: https://github.com/psf/black
-    rev: 21.8b0
+    rev: 22.3.0
     hooks:
     -   id: black
 

diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ For single-node single-gpu training, run:
 ```bash
 conda activate mistral
 cd mistral
-CUDA_VISIBLE_DEVICES=0 python train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id tutorial-gpt2-micro
+CUDA_VISIBLE_DEVICES=0 python train.py --file conf/tutorial-gpt2-micro.yaml --nnodes 1 --nproc_per_node 1 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 2 --run_id tutorial-gpt2-micro
 ```
 
 #### Multi-node multi-GPU training with DeepSpeed
@@ -86,7 +86,7 @@ To start distributed training, run:
 ```bash
 conda activate mistral
 cd mistral
-deepspeed --num_gpus 8 --num_nodes 2 --master_addr machine1 train.py --config conf/tutorial-gpt2-micro.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id tutorial-gpt2-micro-multi-node
+deepspeed --num_gpus 8 --num_nodes 2 --master_addr machine1 train.py --file conf/tutorial-gpt2-micro.yaml --nnodes 2 --nproc_per_node 8 --training_arguments.fp16 true --training_arguments.per_device_train_batch_size 4 --training_arguments.deepspeed conf/deepspeed/z2-small-conf.json --run_id tutorial-gpt2-micro-multi-node
 ```
 
 Note: You may need to adjust your batch size depending on the capacity of your GPUs.

diff --git a/conf/archive/old-benchmarking/gpt2-benchmark-config.yaml b/conf/archive/old-benchmarking/gpt2-benchmark-config.yaml
@@ -4,7 +4,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/openwebtext.yaml
     - models/gpt2-small.yaml
     - trainers/benchmark.yaml

diff --git a/conf/archive/old-benchmarking/gpt2-intensive-config.yaml b/conf/archive/old-benchmarking/gpt2-intensive-config.yaml
@@ -6,7 +6,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/openwebtext.yaml
     - models/gpt2-small.yaml
     - trainers/intensive.yaml

diff --git a/conf/archive/old-benchmarking/gpt2-toy-config.yaml b/conf/archive/old-benchmarking/gpt2-toy-config.yaml
@@ -3,7 +3,7 @@
 #   Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/wikitext103.yaml
     - models/gpt2-small.yaml
     - trainers/toy.yaml

diff --git a/conf/archive/partial-checkpointing/gpt2-mistral-medium-gcheck-config.yaml b/conf/archive/partial-checkpointing/gpt2-mistral-medium-gcheck-config.yaml
@@ -5,7 +5,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - ../../datasets/wikitext103.yaml
     - ../../models/gpt2-medium.yaml
     - ../../trainers/gpt2-medium.yaml

diff --git a/conf/datasets/wikitext103.yaml b/conf/datasets/wikitext103.yaml
@@ -4,7 +4,7 @@
 dataset:
     id: wikitext
     name: wikitext-103-raw-v1
-    validation_ratio: null
+    validation_ratio: 0.005
 
     # Number of Preprocessing Workers
     num_proc: 4

diff --git a/conf/gpt2-debug-config.yaml b/conf/gpt2-debug-config.yaml
@@ -3,8 +3,8 @@
 #   Trainer. Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
-    - datasets/openwebtext.yaml
+inherits:
+    - datasets/wikitext103.yaml
     - models/gpt2-small.yaml
     - trainers/debug.yaml
 

diff --git a/conf/gpt2-mistral-medium-config.yaml b/conf/gpt2-mistral-medium-config.yaml
@@ -5,7 +5,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/openwebtext.yaml
     - models/gpt2-medium.yaml
     - trainers/gpt2-medium.yaml

diff --git a/conf/gpt2-mistral-medium-gcp-config.yaml b/conf/gpt2-mistral-medium-gcp-config.yaml
@@ -6,7 +6,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/openwebtext.yaml
     - models/gpt2-medium.yaml
     - trainers/gpt2-medium.yaml

diff --git a/conf/gpt2-mistral-mini-config.yaml b/conf/gpt2-mistral-mini-config.yaml
@@ -5,7 +5,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/openwebtext.yaml
     - models/gpt2-mini.yaml
     - trainers/gpt2-small.yaml

diff --git a/conf/gpt2-mistral-small-config.yaml b/conf/gpt2-mistral-small-config.yaml
@@ -5,7 +5,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/openwebtext.yaml
     - models/gpt2-small.yaml
     - trainers/gpt2-small.yaml

diff --git a/conf/gpt2-mistral-small-gcp-config.yaml b/conf/gpt2-mistral-small-gcp-config.yaml
@@ -5,7 +5,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/openwebtext.yaml
     - models/gpt2-small.yaml
     - trainers/gpt2-small.yaml

diff --git a/conf/gpt2-scaling-config.yaml b/conf/gpt2-scaling-config.yaml
@@ -6,7 +6,7 @@
 #   Inheritance and core paths can all be overridden from the command line or by re-writing these files.
 ---
 # Inherit Dataset, Tokenization, Model, and Training Details
-inherit:
+inherits:
     - datasets/wikitext103.yaml
     - models/gpt2-small.yaml
     - trainers/gpt2-small.yaml

diff --git a/conf/train_schema.py b/conf/train_schema.py
diff --git a/conf/trainers/benchmark.yaml b/conf/trainers/benchmark.yaml
@@ -6,7 +6,6 @@
 ---
 training_arguments:
     # Overwrite from Top-Level Config
-    output_dir: null
 
     # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
     do_train: true
@@ -16,9 +15,6 @@ training_arguments:
     per_device_train_batch_size: 2
     per_device_eval_batch_size: 2
 
-    # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)]
-    gradient_accumulation_steps: null
-
     # For Online Evaluation, only keep around the Losses
     prediction_loss_only: true
 
@@ -52,16 +48,8 @@ training_arguments:
     # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging)
     ignore_data_skip: false
 
-    # Seeds -- Should be Overwritten at Runtime!
-    seed: null
-
     ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
     fp16: false
-    sharded_ddp: null
-    deepspeed: null
 
     # Dataloader Parallelism
     dataloader_num_workers: 4
-
-    # Should be overwritten from the Top-Level Config or CLI!
-    local_rank: null
diff --git a/conf/trainers/debug.yaml b/conf/trainers/debug.yaml
@@ -6,7 +6,6 @@
 ---
 training_arguments:
     # Overwrite from Top-Level Config
-    output_dir: null
 
     # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
     do_train: true
@@ -16,9 +15,6 @@ training_arguments:
     per_device_train_batch_size: 16
     per_device_eval_batch_size: 16
 
-    # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)]
-    gradient_accumulation_steps: null
-
     # For Online Evaluation, only keep around the Losses
     prediction_loss_only: true
 
@@ -52,16 +48,8 @@ training_arguments:
     # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging)
     ignore_data_skip: false
 
-    # Seeds -- Should be Overwritten at Runtime!
-    seed: null
-
     ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
     fp16: true
-    sharded_ddp: null
-    deepspeed: null
 
     # Dataloader Parallelism
     dataloader_num_workers: 4
-
-    # Should be overwritten from the Top-Level Config or CLI!
-    local_rank: null
diff --git a/conf/trainers/gpt2-medium.yaml b/conf/trainers/gpt2-medium.yaml
@@ -6,7 +6,6 @@
 ---
 training_arguments:
     # Overwrite from Top-Level Config
-    output_dir: null
 
     # Generally sticks to order from HF.TrainingArguments() Docs, skipping over sane defaults/implicitly set args...
     do_train: true
@@ -16,9 +15,6 @@ training_arguments:
     per_device_train_batch_size: 4
     per_device_eval_batch_size: 16
 
-    # We set this dynamically based on DDP Computation [steps = effective_batch / (per_gpu_batch * gpus * nodes)]
-    gradient_accumulation_steps: null
-
     # Boolean whether to use Gradient Checkpointing to save GPU Memory at the expense of runtime
     gradient_checkpointing: false
 
@@ -56,16 +52,8 @@ training_arguments:
     # Resume Behavior --> ignore "full determinism" on resume (saves time for debugging)
     ignore_data_skip: false
 
-    # Seeds -- Should be Overwritten at Runtime!
-    seed: null
-
     ### Optimization -- Precision, DeepSpeed, and FairScale Parameters -- all off for `simple` config
     fp16: true
-    sharded_ddp: null
-    deepspeed: null
 
     # Dataloader Parallelism
     dataloader_num_workers: 4
-
-    # Should be overwritten from the Top-Level Config or CLI!
-    local_rank: null