
Commit d7fae96

felipemello1 and Felipe Mello authored
expose and set lora_dropout = 0.0 (#1492)
Co-authored-by: Felipe Mello <[email protected]>
1 parent: 221031a · commit: d7fae96


48 files changed: +90 additions, −28 deletions (only a subset of the changed files is shown below).
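Every LoRA/QLoRA config in the listing below gains the same one-line addition, making the previously hidden lora_dropout argument explicit and pinning it to 0.0. As a point of reference, here is a minimal, self-contained sketch of where such a dropout probability acts in the standard LoRA formulation; this is an illustration, not torchtune's actual LoRALinear implementation, and all names in it are hypothetical:

import torch
import torch.nn as nn
import torch.nn.functional as F

class LoRALinearSketch(nn.Module):
    """Generic LoRA layer (hypothetical; for illustration only)."""

    def __init__(self, in_dim: int, out_dim: int, rank: int = 8,
                 alpha: float = 16.0, dropout: float = 0.0):
        super().__init__()
        # Frozen base weight; only the low-rank factors are trained.
        self.weight = nn.Parameter(torch.randn(out_dim, in_dim), requires_grad=False)
        self.lora_a = nn.Linear(in_dim, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_dim, bias=False)
        self.dropout = nn.Dropout(p=dropout)  # the knob this commit exposes
        self.scaling = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        base = F.linear(x, self.weight)
        # Dropout is applied only on the low-rank branch; with p=0.0
        # (the default written into these configs) it is the identity,
        # so existing runs are unaffected.
        return base + self.scaling * self.lora_b(self.lora_a(self.dropout(x)))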

recipes/configs/code_llama2/7B_lora_single_device.yaml
Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ model:
   apply_lora_to_output: False
   lora_rank: 8
   lora_alpha: 16
+  lora_dropout: 0.0
 
 # Tokenizer
 tokenizer:

recipes/configs/code_llama2/7B_qlora_single_device.yaml
Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ model:
   apply_lora_to_output: False
   lora_rank: 8
   lora_alpha: 16
+  lora_dropout: 0.0
 
 # Tokenizer
 tokenizer:

recipes/configs/gemma/2B_lora.yaml
Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ model:
   apply_lora_to_mlp: True
   lora_rank: 64
   lora_alpha: 16
+  lora_dropout: 0.0
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer

recipes/configs/gemma/2B_lora_single_device.yaml
Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ model:
   apply_lora_to_mlp: True
   lora_rank: 64
   lora_alpha: 16
+  lora_dropout: 0.0
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer

recipes/configs/gemma/2B_qlora_single_device.yaml
Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ model:
   apply_lora_to_mlp: True
   lora_rank: 64
   lora_alpha: 16
+  lora_dropout: 0.0
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer

recipes/configs/gemma/7B_lora.yaml
Lines changed: 1 addition & 0 deletions

@@ -34,6 +34,7 @@ model:
   apply_lora_to_mlp: True
   lora_rank: 64
   lora_alpha: 16
+  lora_dropout: 0.0
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer

recipes/configs/gemma/7B_lora_single_device.yaml
Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ model:
   apply_lora_to_mlp: True
   lora_rank: 8
   lora_alpha: 16
+  lora_dropout: 0.0
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer

recipes/configs/gemma/7B_qlora_single_device.yaml
Lines changed: 1 addition & 0 deletions

@@ -33,6 +33,7 @@ model:
   apply_lora_to_mlp: True
   lora_rank: 64
   lora_alpha: 16
+  lora_dropout: 0.0
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer

recipes/configs/llama2/13B_lora.yaml
Lines changed: 1 addition & 0 deletions

@@ -27,6 +27,7 @@ model:
   apply_lora_to_output: True
   lora_rank: 8
   lora_alpha: 16
+  lora_dropout: 0.0
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer

recipes/configs/llama2/13B_qlora_single_device.yaml
Lines changed: 1 addition & 0 deletions

@@ -23,6 +23,7 @@ model:
   apply_lora_to_output: False
   lora_rank: 8
   lora_alpha: 16
+  lora_dropout: 0.0
 
 tokenizer:
   _component_: torchtune.models.llama2.llama2_tokenizer
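
With the field now exposed in every LoRA/QLoRA config, it can be changed at launch time instead of by editing the YAML; assuming the standard torchtune dot-notation override syntax, a run might look like the following (the 0.05 value is illustrative, not a recommendation):

tune run lora_finetune_single_device \
  --config llama2/13B_qlora_single_device \
  model.lora_dropout=0.05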
