
Commit fc12441

Enable LoRA for all linear layers in tinyllama (#1122)
1 parent af41ee9 commit fc12441

File tree

3 files changed: +14 -13 lines changed

- config_hub/finetune/README.md
- config_hub/finetune/tiny-llama/lora.yaml
- config_hub/finetune/tiny-llama/qlora.yaml

config_hub/finetune/README.md

Lines changed: 4 additions & 3 deletions
@@ -1,10 +1,11 @@
 ## Config files
 
-The table below lists the performances you can expect from the provided config files. Note that you can achieve lower memory consumption by lowering the micro batch size as needed. See the [Dealing with out-of-memory (OOM) errors](../../tutorials/oom.md) on lowering the memory requirements.
+The table below lists the performances you can expect from the provided config files. Note that you can achieve lower memory consumption by lowering the micro batch size as needed. In addition, you can lower the rank (`lora_r`) in the LoRA configuration files and disable LoRA for certain layers (for example, setting `lora_projection` and other LoRA layer-specific parameters to `false`).
+For more information, see the [Dealing with out-of-memory (OOM) errors](../../tutorials/oom.md) on lowering the memory requirements.
 
 |                       | Size | Dataset   | Epochs | Val loss | Peak memory | Max seq length | Micro batch size | Precision | Training runtime |
 | --------------------- | ---- | --------- | ------ | -------- | ----------- | -------------- | ---------------- | --------- | ---------------- |
-| tiny-llama/lora.yaml  | 1.1B | Alpaca 2k | 4      | 1.053    | 10.54 GB    | 512            | 8                | bfloat16  | 9.24 min (A10G)  |
-| tiny-llama/qlora.yaml | 1.1B | Alpaca 2k | 4      | 1.074    | 13.32 GB    | 512            | 8                | bfloat16  | 9.89 min (A10G)  |
+| tiny-llama/lora.yaml  | 1.1B | Alpaca 2k | 3      | 1.038    | 13.50 GB    | 512            | 8                | bfloat16  | 8.06 min (A10G)  |
+| tiny-llama/qlora.yaml | 1.1B | Alpaca 2k | 3      | 1.056    | 16.24 GB    | 512            | 8                | bfloat16  | 8.74 min (A10G)  |
 | tiny-llama/full.yaml  | 1.1B | Alpaca 2k | 1      | 1.105    | 14.10 GB    | 512            | 4                | bfloat16  | 2.59 min (A10G)  |
 | llama-2-7b/qlora.yaml | 7B   | Alpaca 2k | 4      | 0.814    | 13.68 GB    | 512            | 2                | bfloat16  | 45.68 min (A10G) |
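
The paragraph added to the README above names three memory knobs: the micro batch size, the LoRA rank (`lora_r`), and the per-layer LoRA switches. As a rough sketch only (the values below are illustrative and not part of this commit), dialing those switches back down in `tiny-llama/lora.yaml` could look like this:

```yaml
# Hypothetical memory-saving overrides for config_hub/finetune/tiny-llama/lora.yaml.
# The keys mirror the ones touched in this commit; the values are illustrative.
lora_r: 4               # lower rank means fewer trainable adapter parameters
lora_projection: false  # skip LoRA on the attention output projection
lora_mlp: false         # skip LoRA on the MLP block
lora_head: false        # skip LoRA on the output head
```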

config_hub/finetune/tiny-llama/lora.yaml

Lines changed: 5 additions & 5 deletions
@@ -27,19 +27,19 @@ lora_dropout: 0.05
 lora_query: true
 
 # Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
-lora_key: false
+lora_key: true
 
 # Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
 lora_value: true
 
 # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
-lora_projection: false
+lora_projection: true
 
 # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
-lora_mlp: false
+lora_mlp: true
 
 # Whether to apply LoRA to output head in GPT. (type: bool, default: False)
-lora_head: false
+lora_head: true
 
 # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
 data:
@@ -71,7 +71,7 @@ train:
   lr_warmup_steps: 10
 
   # Number of epochs to train on (type: Optional[int], default: 5)
-  epochs: 4
+  epochs: 3
 
   # Total number of tokens to train on (type: Optional[int], default: null)
   max_tokens:
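
Taken together, the hunks above switch on every LoRA flag in `tiny-llama/lora.yaml`, which is what the commit title refers to: after this change the adapter is applied to all linear layers rather than only the query and value projections. Reconstructed from the diff, the relevant block now reads:

```yaml
# LoRA placement in tiny-llama/lora.yaml after this commit (reconstructed from the hunks above).
lora_query: true
lora_key: true
lora_value: true
lora_projection: true
lora_mlp: true
lora_head: true
```

The epoch count drops from 4 to 3 at the same time, and the README table above reflects the trade-off: lower validation loss and shorter runtime at the cost of a higher memory peak.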

config_hub/finetune/tiny-llama/qlora.yaml

Lines changed: 5 additions & 5 deletions
@@ -27,19 +27,19 @@ lora_dropout: 0.05
 lora_query: true
 
 # Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
-lora_key: false
+lora_key: true
 
 # Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
 lora_value: true
 
 # Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
-lora_projection: false
+lora_projection: true
 
 # Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
-lora_mlp: false
+lora_mlp: true
 
 # Whether to apply LoRA to output head in GPT. (type: bool, default: False)
-lora_head: false
+lora_head: true
 
 # Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
 data:
@@ -71,7 +71,7 @@ train:
   lr_warmup_steps: 10
 
   # Number of epochs to train on (type: Optional[int], default: 5)
-  epochs: 4
+  epochs: 3
 
   # Total number of tokens to train on (type: Optional[int], default: null)
   max_tokens:
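
The `qlora.yaml` changes mirror the `lora.yaml` ones exactly, so adapter placement stays identical between the two configs. The remaining memory knob from the README, the micro batch size, is untouched by this commit; assuming it lives under the `train:` section like the keys in the hunks above (the key name below is an assumption, since it does not appear in this diff), lowering it would be a one-line change:

```yaml
train:
  # Assumed key name (not shown in this diff); the README table reports a value of 8.
  # Halving it lowers peak memory, typically at the cost of some runtime.
  micro_batch_size: 4
```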
