Skip to content

Commit 091d58d

Browse files
committed
Disable prefix tuning as it's currently not supported; Limit llama_adapter usage to non-FSDP only
1 parent 14e4b05 commit 091d58d

File tree

5 files changed

+15
-12
lines changed

5 files changed

+15
-12
lines changed

recipes/finetuning/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ It lets us specify the training settings for everything from `model_name` to `da
7070

7171
* [Datasets config file](../../src/llama_recipes/configs/datasets.py) provides the available options for datasets.
7272

73-
* [peft config file](../../src/llama_recipes/configs/peft.py) provides the supported PEFT methods and respective settings that can be modified.
73+
* [peft config file](../../src/llama_recipes/configs/peft.py) provides the supported PEFT methods and respective settings that can be modified. We currently support LoRA and LLaMA-Adapter. Please note that LoRA is the only technique which is supported in combination with FSDP.
7474

7575
* [FSDP config file](../../src/llama_recipes/configs/fsdp.py) provides FSDP settings such as:
7676

src/llama_recipes/configs/peft.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ class llama_adapter_config:
2020
adapter_layers: int= 30
2121
task_type: str= "CAUSAL_LM"
2222

23+
#CAUTION prefix tuning is currently not supported
2324
@dataclass
2425
class prefix_config:
2526
num_virtual_tokens: int=30
26-
task_type: str= "CAUSAL_LM"
27+
task_type: str= "CAUSAL_LM"

src/llama_recipes/configs/training.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class train_config:
2929
mixed_precision: bool=True
3030
val_batch_size: int=1
3131
dataset = "samsum_dataset"
32-
peft_method: str = "lora" # None,llama_adapter, prefix
32+
peft_method: str = "lora" # None, llama_adapter (Caution: llama_adapter is currently not supported with FSDP)
3333
use_peft: bool=False
3434
output_dir: str = "PATH/to/save/PEFT/model"
3535
freeze_layers: bool = False

src/llama_recipes/utils/config_utils.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,17 @@ def generate_peft_config(train_config, kwargs):
4545
peft_configs = (LoraConfig, AdaptionPromptConfig, PrefixTuningConfig)
4646
names = tuple(c.__name__.rstrip("_config") for c in configs)
4747

48-
assert train_config.peft_method in names, f"Peft config not found: {train_config.peft_method}"
48+
assert (
49+
train_config.peft_method in names
50+
), f"Peft config not found: {train_config.peft_method}"
51+
52+
assert (
53+
train_config.peft_method != "prefix"
54+
), "PrefixTuning is currently not supported (see https://github.com/meta-llama/llama-recipes/issues/359#issuecomment-2089350811)"
55+
if train_config.enable_fsdp:
56+
assert (
57+
train_config.peft_method != "llama_adapter"
58+
), "Llama_adapter is currently not supported in combination with FSDP (see https://github.com/meta-llama/llama-recipes/issues/359#issuecomment-2089274425)"
4959

5060
config = configs[names.index(train_config.peft_method)]()
5161

src/llama_recipes/utils/fsdp_utils.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ def fsdp_auto_wrap_policy(model, transformer_layer_name):
88

99
from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy
1010

11-
from peft.tuners import PrefixEncoder, PromptEmbedding, PromptEncoder
12-
1311
def lambda_policy_fn(module):
1412
if (
1513
len(list(module.named_children())) == 0
@@ -23,13 +21,7 @@ def lambda_policy_fn(module):
2321
transformer_wrap_policy = functools.partial(
2422
transformer_auto_wrap_policy,
2523
transformer_layer_cls=(
26-
PrefixEncoder,
27-
PromptEncoder,
28-
PromptEmbedding,
2924
transformer_layer_name,
30-
# FullyShardedDataParallelPlugin.get_module_class_from_name(
31-
# model, transformer_layer_name
32-
# ),
3325
),
3426
)
3527

0 commit comments

Comments
 (0)