diff --git a/src/peft/tuners/boft/layer.py b/src/peft/tuners/boft/layer.py index 7232f39d17..470ce12312 100644 --- a/src/peft/tuners/boft/layer.py +++ b/src/peft/tuners/boft/layer.py @@ -457,10 +457,10 @@ def cayley_batch(self, data): skew_mat = 0.5 * (data - data.transpose(1, 2)) id_mat = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) - # Perform the Cayley parametrization + # Perform the Cayley parametrization, must be in float32 Q = torch.linalg.solve(id_mat + skew_mat, id_mat - skew_mat, left=False) - return Q + return Q.to(data.dtype) class Linear(nn.Module, BOFTLayer): @@ -586,7 +586,7 @@ def get_delta_weight(self, adapter) -> tuple[torch.Tensor, torch.Tensor]: block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) - boft_P = self.boft_P.to(block_diagonal_butterfly.device) + boft_P = self.boft_P.to(block_diagonal_butterfly.device, block_diagonal_butterfly.dtype) butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) butterfly_oft_mat = butterfly_oft_mat_batch[0] @@ -919,7 +919,7 @@ def get_delta_weight(self, adapter) -> tuple[torch.Tensor, torch.Tensor]: block_diagonal_butterfly = torch.block_diag(*torch.unbind(orth_rotate_butterfly)) block_diagonal_butterfly = block_diagonal_butterfly.unsqueeze(0) - boft_P = self.boft_P.to(block_diagonal_butterfly.device) + boft_P = self.boft_P.to(block_diagonal_butterfly.device, block_diagonal_butterfly.dtype) butterfly_oft_mat_batch = torch.bmm(block_diagonal_butterfly, boft_P.permute(0, 2, 1)) butterfly_oft_mat_batch = torch.bmm(boft_P, butterfly_oft_mat_batch) butterfly_oft_mat = butterfly_oft_mat_batch[0] diff --git a/src/peft/tuners/c3a/model.py b/src/peft/tuners/c3a/model.py index 6e71973691..19c9ef763e 100644 --- a/src/peft/tuners/c3a/model.py +++ b/src/peft/tuners/c3a/model.py @@ -93,5 +93,9 @@ def _create_new_module(c3a_config, adapter_name, target, **kwargs): if isinstance(target_base_layer, torch.nn.Linear): new_module = C3ALinear(target, adapter_name, **kwargs) + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only `torch.nn.Linear` is supported." + ) return new_module diff --git a/src/peft/tuners/fourierft/layer.py b/src/peft/tuners/fourierft/layer.py index a03a57f118..f95a414db9 100644 --- a/src/peft/tuners/fourierft/layer.py +++ b/src/peft/tuners/fourierft/layer.py @@ -21,6 +21,7 @@ from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.other import transpose class FourierFTLayer(BaseTunerLayer): @@ -139,7 +140,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. 
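The BOFT hunks above keep the batched Cayley solve in float32 and cast the result back to the adapter dtype at the end. A minimal self-contained sketch of that pattern (the upcast of `data` inside the helper is added here for illustration; in the layer itself the input is already float32):

import torch

def cayley_batch_fp32(data: torch.Tensor) -> torch.Tensor:
    # data: a batch of (b, r, r) square blocks. The linear solve is numerically
    # fragile in float16/bfloat16, so it is done in float32 and the result is
    # cast back to the caller's dtype, mirroring the hunk above.
    b, r, c = data.shape
    data32 = data.to(torch.float32)
    skew_mat = 0.5 * (data32 - data32.transpose(1, 2))
    id_mat = torch.eye(r, device=data.device, dtype=torch.float32).unsqueeze(0).expand(b, r, c)
    # Cayley transform of the skew-symmetric part, computed with a batched solve
    # instead of an explicit matrix inverse
    Q = torch.linalg.solve(id_mat + skew_mat, id_mat - skew_mat, left=False)
    return Q.to(data.dtype)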
orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) + orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -148,7 +149,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -161,10 +162,9 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.fourierft_spectrum.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) - - def get_delta_weight(self, adapter) -> torch.Tensor: - return super().get_delta_weight(adapter) + self.get_base_layer().weight.data -= transpose( + self.get_delta_weight(active_adapter), self.fan_in_fan_out + ) def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype diff --git a/src/peft/tuners/ln_tuning/layer.py b/src/peft/tuners/ln_tuning/layer.py index e29149f2cb..4000e992a7 100644 --- a/src/peft/tuners/ln_tuning/layer.py +++ b/src/peft/tuners/ln_tuning/layer.py @@ -105,7 +105,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: if self.merged: self.unmerge() result = self.base_layer(x, *args, **kwargs) - elif self.merged: + elif self.merged or (len(self.active_adapters) == 0): result = self.base_layer(x, *args, **kwargs) else: if len(self.active_adapters) != 1: diff --git a/src/peft/tuners/waveft/layer.py b/src/peft/tuners/waveft/layer.py index a17f3ffba3..c5030e4a16 100644 --- a/src/peft/tuners/waveft/layer.py +++ b/src/peft/tuners/waveft/layer.py @@ -21,6 +21,7 @@ from transformers.pytorch_utils import Conv1D from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.other import transpose from .constants import WAVELET_REDUCTIONS from .waverec2d import waverec2d @@ -237,7 +238,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N # Note that safe_merge will be slower than the normal merge # because of the copy operation. 
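The FourierFT merge/unmerge changes above (and the matching WaveFT hunks below) route the delta through `transpose(..., self.fan_in_fan_out)` so that Conv1D-backed layers (GPT-2 style) merge with the right weight orientation. A hedged sketch of the idea; `merge_delta` is illustrative only, not part of the PR:

import torch

def transpose(weight: torch.Tensor, fan_in_fan_out: bool) -> torch.Tensor:
    # Rough stand-in for peft.utils.other.transpose: Conv1D layers store their
    # weight as (in_features, out_features), so the delta has to be flipped
    # before it is added to or subtracted from base_layer.weight.
    return weight.T if fan_in_fan_out else weight

def merge_delta(base_weight: torch.Tensor, delta: torch.Tensor, fan_in_fan_out: bool) -> torch.Tensor:
    # merge adds the (possibly transposed) delta; unmerge subtracts the same quantity
    return base_weight + transpose(delta, fan_in_fan_out)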
orig_weights = base_layer.weight.data.clone() - orig_weights += self.get_delta_weight(active_adapter) + orig_weights += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) if not torch.isfinite(orig_weights).all(): raise ValueError( @@ -246,7 +247,7 @@ def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = N base_layer.weight.data = orig_weights else: - base_layer.weight.data += self.get_delta_weight(active_adapter) + base_layer.weight.data += transpose(self.get_delta_weight(active_adapter), self.fan_in_fan_out) self.merged_adapters.append(active_adapter) def unmerge(self) -> None: @@ -259,10 +260,9 @@ def unmerge(self) -> None: while len(self.merged_adapters) > 0: active_adapter = self.merged_adapters.pop() if active_adapter in self.waveft_spectrum.keys(): - self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) - - def get_delta_weight(self, adapter) -> torch.Tensor: - return super().get_delta_weight(adapter) + self.get_base_layer().weight.data -= transpose( + self.get_delta_weight(active_adapter), self.fan_in_fan_out + ) def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: previous_dtype = x.dtype diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index 06402d637b..1b1d3631bb 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -327,10 +327,6 @@ def _skip_alora_no_activation(config_cls, config_kwargs): class TestDecoderModels(PeftCommonTester): transformers_class = AutoModelForCausalLM - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/test_encoder_decoder_models.py b/tests/test_encoder_decoder_models.py index 1ec0aa0668..038ca22698 100644 --- a/tests/test_encoder_decoder_models.py +++ b/tests/test_encoder_decoder_models.py @@ -228,10 +228,6 @@ class TestEncoderDecoderModels(PeftCommonTester): transformers_class = AutoModelForSeq2SeqLM - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) decoder_input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) diff --git a/tests/test_feature_extraction_models.py b/tests/test_feature_extraction_models.py index a5377827f4..c6054782ba 100644 --- a/tests/test_feature_extraction_models.py +++ b/tests/test_feature_extraction_models.py @@ -258,10 +258,6 @@ class TestPeftFeatureExtractionModel(PeftCommonTester): transformers_class = AutoModel - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index 909f456aa7..00b9d05ceb 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -4558,7 +4558,7 @@ def _test_model(self, model, precision): input_ids = torch.randint(0, 1000, (2, 10)).to(self.device) if precision == torch.bfloat16: if not is_bf16_available(): - self.skipTest("Bfloat16 not supported on this device") + 
pytest.skip("Bfloat16 not supported on this device") # Forward pass with test precision with torch.autocast(enabled=True, dtype=precision, device_type=self.device): diff --git a/tests/test_mixed.py b/tests/test_mixed.py index 7ec18387c8..d7b663c182 100644 --- a/tests/test_mixed.py +++ b/tests/test_mixed.py @@ -526,7 +526,7 @@ def test_target_first_layer_same_type(self, config0, config1): def test_deeply_nested(self): # a somewhat absurdly nested model using different adapter types if platform.system() == "Linux": - self.skipTest("This test fails but only on GitHub CI with Linux systems.") + pytest.skip("This test fails but only on GitHub CI with Linux systems.") atol = 1e-5 rtol = 1e-5 diff --git a/tests/test_seq_classifier.py b/tests/test_seq_classifier.py index 03869c3a7a..b6a0ff28e6 100644 --- a/tests/test_seq_classifier.py +++ b/tests/test_seq_classifier.py @@ -234,10 +234,6 @@ class TestSequenceClassificationModels(PeftCommonTester): transformers_class = AutoModelForSequenceClassification - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/test_target_parameters.py b/tests/test_target_parameters.py index adffbce0d5..64297daf3c 100644 --- a/tests/test_target_parameters.py +++ b/tests/test_target_parameters.py @@ -169,10 +169,6 @@ class TestDecoderModelsTargetParameters(PeftCommonTester): # generally, nothing is broken. transformers_class = MyAutoModelForCausalLM - def skipTest(self, reason=""): - # for backwards compatibility with unittest style test classes - pytest.skip(reason) - def prepare_inputs_for_testing(self): input_ids = torch.tensor([[1, 1, 1], [1, 2, 1]]).to(self.torch_device) attention_mask = torch.tensor([[1, 1, 1], [1, 0, 1]]).to(self.torch_device) diff --git a/tests/testing_common.py b/tests/testing_common.py index dab9ee6e45..75594f15f6 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -33,27 +33,19 @@ from peft import ( AdaLoraConfig, BOFTConfig, - BoneConfig, CPTConfig, - DeloraConfig, - FourierFTConfig, - HRAConfig, IA3Config, LNTuningConfig, LoHaConfig, LoKrConfig, LoraConfig, - MissConfig, - OFTConfig, PeftModel, - PeftType, PrefixTuningConfig, PromptEncoderConfig, PromptLearningConfig, PromptTuningConfig, - RandLoraConfig, + RoadConfig, VBLoRAConfig, - VeraConfig, get_peft_model, get_peft_model_state_dict, inject_adapter_in_model, @@ -73,131 +65,29 @@ from .testing_utils import get_state_dict, hub_online_once -CONFIG_TESTING_KWARGS = ( - # IA³ - { - "target_modules": None, - "feedforward_modules": None, - }, - # LoRA - { - "r": 8, - "lora_alpha": 32, - "target_modules": None, - "lora_dropout": 0.05, - "bias": "none", - }, - # prefix tuning - { - "num_virtual_tokens": 10, - }, - # prompt encoder - { - "num_virtual_tokens": 10, - "encoder_hidden_size": 32, - }, - # prompt tuning - { - "num_virtual_tokens": 10, - }, - # AdaLoRA - { - "target_modules": None, - "total_step": 1, - }, - # BOFT - { - "target_modules": None, - }, - # VeRA - { - "r": 8, - "target_modules": None, - "vera_dropout": 0.05, - "projection_prng_key": 0xFF, - "d_initial": 0.1, - "save_projection": True, - "bias": "none", - }, - # FourierFT - { - "n_frequency": 10, - "target_modules": None, - }, - # HRA - { - "target_modules": None, - }, - # VBLoRA - {"target_modules": None, "vblora_dropout": 0.05, 
"vector_length": 1, "num_vectors": 2}, - # OFT - { - "target_modules": None, - }, - # Bone - { - "target_modules": None, - "r": 2, - }, - # MiSS - { - "target_modules": None, - "r": 2, - }, - # LoRA + trainable_tokens - { - "r": 8, - "lora_alpha": 32, - "target_modules": None, - "lora_dropout": 0.05, - "bias": "none", - "trainable_token_indices": [0, 1, 3], - }, - # RandLoRA - { - "r": 32, - "randlora_alpha": 64, - "target_modules": None, - "randlora_dropout": 0.05, - "projection_prng_key": 0xFF, - "save_projection": True, - "bias": "none", - }, - # CPT tuninig - { - "cpt_token_ids": [0, 1, 2, 3, 4, 5, 6, 7], # Example token IDs for testing - "cpt_mask": [1, 1, 1, 1, 1, 1, 1, 1], - "cpt_tokens_type_mask": [1, 2, 2, 2, 3, 3, 4, 4], - }, - # DeLoRA - { - "r": 8, - "target_modules": None, - "bias": "none", - }, -) +def _skip_if_merging_not_supported(config_cls, config_kwargs): + if issubclass(config_cls, PromptLearningConfig): + pytest.skip("Prompt learning does not support merging, skipping this test.") + if config_kwargs.get("alora_invocation_tokens") is not None: + pytest.skip("Test not applicable for Activated LoRA") + + +def _skip_if_adding_weighted_adapters_not_supported(config): + if not isinstance(config, (IA3Config, LoraConfig)): + pytest.skip("This PEFT method does not support adding weighted adapters, skipping this test.") + + +def _skip_if_deleting_adapter_not_supported(config_cls, config_kwargs): + if issubclass(config_cls, PromptLearningConfig): + pytest.skip("Prompt learning does not support deletion of adapters, skipping this test.") + -CLASSES_MAPPING = { - "ia3": (IA3Config, CONFIG_TESTING_KWARGS[0]), - "lora": (LoraConfig, CONFIG_TESTING_KWARGS[1]), - "prefix_tuning": (PrefixTuningConfig, CONFIG_TESTING_KWARGS[2]), - "prompt_encoder": (PromptEncoderConfig, CONFIG_TESTING_KWARGS[3]), - "prompt_tuning": (PromptTuningConfig, CONFIG_TESTING_KWARGS[4]), - "adalora": (AdaLoraConfig, CONFIG_TESTING_KWARGS[5]), - "boft": (BOFTConfig, CONFIG_TESTING_KWARGS[6]), - "vera": (VeraConfig, CONFIG_TESTING_KWARGS[7]), - "fourierft": (FourierFTConfig, CONFIG_TESTING_KWARGS[8]), - "hra": (HRAConfig, CONFIG_TESTING_KWARGS[9]), - "vblora": (VBLoRAConfig, CONFIG_TESTING_KWARGS[10]), - "oft": (OFTConfig, CONFIG_TESTING_KWARGS[11]), - "bone": (BoneConfig, CONFIG_TESTING_KWARGS[12]), - "miss": (MissConfig, CONFIG_TESTING_KWARGS[12]), - "lora+trainable_tokens": (LoraConfig, CONFIG_TESTING_KWARGS[13]), - "randlora": (RandLoraConfig, CONFIG_TESTING_KWARGS[14]), - "delora": (DeloraConfig, CONFIG_TESTING_KWARGS[17]), -} - -DECODER_MODELS_EXTRA = {"cpt": (CPTConfig, CONFIG_TESTING_KWARGS[15])} +def _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs): + if "gpt2" not in model_id.lower(): + return + + if config_cls not in (IA3Config, LoHaConfig, LoKrConfig, LoraConfig): + pytest.skip("This PEFT method does not support Conv1D layers, skipping this test.") class PeftCommonTester: @@ -305,7 +195,7 @@ def _test_adapter_name(self, model_id, config_cls, config_kwargs): def _test_prepare_for_training(self, model_id, config_cls, config_kwargs): if config_kwargs.get("trainable_token_indices", None) is not None: # incompatible because trainable tokens is marking embeddings as trainable - self.skipTest("Trainable tokens is incompatible with this test.") + pytest.skip("Trainable tokens is incompatible with this test.") # some tests require specific tokenizers, make sure that they can be fetched as well with hub_online_once(model_id + config_kwargs.get("tokenizer_name_or_path", "")): @@ -460,7 +350,7 @@ 
def _test_save_pretrained(self, model_id, config_cls, config_kwargs, safe_serial def _test_save_pretrained_selected_adapters(self, model_id, config_cls, config_kwargs, safe_serialization=True): if issubclass(config_cls, AdaLoraConfig): # AdaLora does not support adding more than 1 adapter - return pytest.skip(f"Test not applicable for {config_cls}") + pytest.skip(f"Test not applicable for {config_cls}") # ensure that the weights are randomly initialized if issubclass(config_cls, LoraConfig): @@ -595,20 +485,10 @@ def _test_load_multiple_adapters(self, model_id, config_cls, config_kwargs): assert load_result2.missing_keys == [] def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): - if ( - config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig) - or config_kwargs.get("alora_invocation_tokens") is not None - ): - # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) - if config_kwargs.get("alora_invocation_tokens") is None: - return pytest.skip(f"Test not applicable for {config_cls}") - else: - return pytest.skip("Test not applicable for Activated LoRA") - if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) if (self.torch_device in ["cpu"]) and (version.parse(torch.__version__) <= version.parse("2.1")): - self.skipTest("PyTorch 2.1 not supported for Half of addmm_impl_cpu_ ") + pytest.skip("PyTorch 2.1 not supported for Half of addmm_impl_cpu_ ") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id, dtype=torch.float16) @@ -625,27 +505,8 @@ def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): _ = model.merge_and_unload() def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): - if ( - config_cls - not in ( - LoraConfig, - IA3Config, - AdaLoraConfig, - LoHaConfig, - LoKrConfig, - VeraConfig, - FourierFTConfig, - ) - or config_kwargs.get("alora_invocation_tokens") is not None - ): - # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) - return - if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - - if "gemma" in model_id.lower(): - # TODO: could be related to tied weights - self.skipTest("Merging currently fails with gemma") + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -680,14 +541,7 @@ def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): model = model.to(self.torch_device) for name, module in model.named_parameters(): - if ( - "lora_A" in name - or "ia3" in name - or "lora_E" in name - or "lora_B" in name - or "vera_lambda" in name - or "fourierft_spectrum" in name - ): + if model.prefix in name: module.data[0] = torch.nan with pytest.raises( @@ -712,21 +566,8 @@ def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): model = model.merge_and_unload(safe_merge=True) def _test_merge_layers(self, model_id, config_cls, config_kwargs): - if issubclass(config_cls, PromptLearningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") - - if issubclass(config_cls, (OFTConfig, BOFTConfig)): - return 
pytest.skip(f"Test not applicable for {config_cls}") - - if config_kwargs.get("alora_invocation_tokens") is not None: - return pytest.skip("Merging not applicable to aLoRA") - - if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - - if "gemma" in model_id.lower(): - # TODO: could be related to tied weights - self.skipTest("Merging currently fails with gemma") + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -756,15 +597,18 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): logits_merged_unloaded = model(**dummy_input)[0] conv_ids = ["Conv2d", "Conv3d", "Conv2d2"] + is_decoder = getattr(getattr(model, "config", None), "is_decoder", False) atol, rtol = 1e-4, 1e-4 if self.torch_device in ["mlu"]: atol, rtol = 1e-3, 1e-3 # MLU - if config.peft_type == "ADALORA": - # AdaLoRA is a bit flaky on CI, but this cannot be reproduced locally + if config.peft_type in ("ADALORA", "OFT"): + # these methods require a bit higher tolerance atol, rtol = 1e-2, 1e-2 - if (config.peft_type in {"IA3", "LORA"}) and (model_id in conv_ids): + if (config.peft_type in {"IA3", "LORA", "OFT"}) and (model_id in conv_ids): # for some reason, the Conv introduces a larger error atol, rtol = 0.3, 0.01 + if (config.peft_type == "OFT") and not is_decoder: + atol, rtol = 0.3, 0.01 if model_id == "trl-internal-testing/tiny-Llama4ForCausalLM": # also getting larger errors here, not exactly sure why atol, rtol = 0.3, 0.01 @@ -800,23 +644,12 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_merged, logits_merged_from_pretrained, atol=atol, rtol=rtol) def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): - supported_peft_types = [ - PeftType.LORA, - PeftType.LOHA, - PeftType.LOKR, - PeftType.IA3, - PeftType.OFT, - PeftType.BOFT, - PeftType.HRA, - PeftType.BONE, - PeftType.MISS, - ] - - if ("gpt2" in model_id.lower()) and (config_cls == IA3Config): - self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") - + _skip_if_merging_not_supported(config_cls, config_kwargs) + if issubclass(config_cls, AdaLoraConfig): + # AdaLora does not support adding more than 1 adapter + pytest.skip("AdaLoRA does not support multiple adapters, skipping this test.") if config_kwargs.get("trainable_token_indices", None) is not None: - self.skipTest( + pytest.skip( "Merging two adapters with trainable tokens is tested elsewhere since adapters with " "the same token indices cannot be merged." 
) @@ -825,12 +658,14 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): base_model_name_or_path=model_id, **config_kwargs, ) - - if config.peft_type not in supported_peft_types or config_kwargs.get("alora_invocation_tokens") is not None: - return + if config_cls == VBLoRAConfig: + # for VBLoRA, increase this value or else the two adapters are too similar + config.init_logits_std *= 100 + config.init_vector_bank_bound *= 100 with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) + torch.manual_seed(0) model = get_peft_model(model, config) model = model.to(self.torch_device) @@ -840,6 +675,7 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): with torch.inference_mode(): logits_adapter_1 = model(**dummy_input)[0] + torch.manual_seed(1) model.add_adapter("adapter-2", config) model.set_adapter("adapter-2") model.eval() @@ -888,9 +724,9 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_merged_adapter_default, logits_adapter_1, atol=1e-3, rtol=1e-3) def _test_merge_layers_is_idempotent(self, model_id, config_cls, config_kwargs): - if config_kwargs.get("alora_invocation_tokens") is not None: - # Merging not supported for Activated LoRA (aLoRA) - return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") + _skip_if_merging_not_supported(config_cls, config_kwargs) + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) + with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) config = config_cls( @@ -913,9 +749,8 @@ def _test_merge_layers_is_idempotent(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_0, logits_1, atol=1e-6, rtol=1e-6) def _test_safe_merge(self, model_id, config_cls, config_kwargs): - if config_kwargs.get("alora_invocation_tokens") is not None: - # Merging not supported for Activated LoRA (aLoRA) - return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") + _skip_if_merging_not_supported(config_cls, config_kwargs) + torch.manual_seed(0) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -960,8 +795,8 @@ def _test_safe_merge(self, model_id, config_cls, config_kwargs): def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): # Test for mixing different adapters in a single batch by passing the adapter_names argument - if config_cls not in (LoraConfig,): - return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") + if config_cls not in (LoraConfig, RoadConfig): + pytest.skip(f"Mixed adapter batches not supported for {config_cls}") config = config_cls( base_model_name_or_path=model_id, @@ -1025,13 +860,14 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co # Test generating with beam search and with mixing different adapters in a single batch by passing the # adapter_names argument. See #2283. 
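`_test_mixed_adapter_batches` (and the beam-search variant below) exercise the `adapter_names` forward argument, which routes each sample of a batch through a different adapter, or through the base model via `"__base__"`. A usage sketch; the model id and adapter names are placeholders:

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")  # placeholder model id
model = get_peft_model(base, LoraConfig(target_modules=["c_attn"]), adapter_name="adapter0")
model.add_adapter("adapter1", LoraConfig(target_modules=["c_attn"]))

input_ids = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# one adapter name per sample; "__base__" routes that row through the base model only
adapter_names = ["__base__", "adapter0", "adapter1"]
with torch.inference_mode():
    output = model(input_ids=input_ids, adapter_names=adapter_names)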
if config_cls not in (LoraConfig,): - return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") + # note: RoAD supports mixed adapter batches but not beam search + pytest.skip(f"Mixed adapter batches not supported for {config_cls}") if config_kwargs.get("alora_invocation_tokens") is not None: - return pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported + pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported if config_kwargs.get("trainable_token_indices", None) is not None: # for some configurations this test will fail since the adapter values don't differ. # this is probably a problem with the test setup and not with the implementation. - return pytest.skip("Trainable token indices is not supported here (yet).") + pytest.skip("Trainable token indices is not supported here (yet).") config = config_cls( base_model_name_or_path=model_id, @@ -1143,11 +979,7 @@ def _test_generate_pos_args(self, model_id, config_cls, config_kwargs, raises_er _ = model.generate(inputs["input_ids"]) def _test_generate_half_prec(self, model_id, config_cls, config_kwargs): - if config_cls not in (IA3Config, LoraConfig, PrefixTuningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") - - if self.torch_device == "mps": # BFloat16 is not supported on MPS - return pytest.skip("BFloat16 is not supported on MPS") + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id, dtype=torch.bfloat16) @@ -1166,7 +998,7 @@ def _test_generate_half_prec(self, model_id, config_cls, config_kwargs): def _test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_kwargs): if config_cls not in (PrefixTuningConfig,): - return pytest.skip(f"Test not applicable for {config_cls}") + pytest.skip(f"Test not applicable for {config_cls}") config = config_cls( base_model_name_or_path=model_id, @@ -1182,10 +1014,10 @@ def _test_prefix_tuning_half_prec_conversion(self, model_id, config_cls, config_ def _test_training(self, model_id, config_cls, config_kwargs): if issubclass(config_cls, PromptLearningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") + pytest.skip("Prompt learning does not support merging, skipping this test.") if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): # TODO: no gradients on the "dense" layer, other layers work, not sure why - self.skipTest("AdaLora with RoBERTa does not work correctly") + pytest.skip("AdaLora with RoBERTa does not work correctly") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1202,18 +1034,20 @@ def _test_training(self, model_id, config_cls, config_kwargs): output = model(**inputs)[0] loss = output.sum() loss.backward() - parameter_prefix = model.prefix - for n, param in model.named_parameters(): - if (parameter_prefix in n) or ("modules_to_save" in n) or ("token_adapter.trainable_tokens" in n): - assert param.grad is not None - else: - assert param.grad is None + + parameter_prefix = getattr(model, "prefix", None) + if parameter_prefix is not None: # can only check PEFT methods that allow to identify PEFT params + for n, param in model.named_parameters(): + if (parameter_prefix in n) or ("modules_to_save" in n) or ("token_adapter.trainable_tokens" in n): + assert param.grad is not None + else: + assert param.grad is None def _test_inference_safetensors(self, model_id, config_cls, 
config_kwargs): if (config_cls == PrefixTuningConfig) and ("deberta" in model_id.lower()): # TODO: raises an error: # TypeError: DebertaModel.forward() got an unexpected keyword argument 'past_key_values' - self.skipTest("DeBERTa with PrefixTuning does not work correctly") + pytest.skip("DeBERTa with PrefixTuning does not work correctly") config = config_cls( base_model_name_or_path=model_id, @@ -1251,14 +1085,16 @@ def _test_inference_safetensors(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits, logits_from_pretrained, atol=1e-4, rtol=1e-4) def _test_training_layer_indexing(self, model_id, config_cls, config_kwargs): - if config_cls not in (LoraConfig,): - return pytest.skip(f"Test not applicable for {config_cls}") + try: + config = config_cls( + base_model_name_or_path=model_id, + layers_to_transform=[0], + **config_kwargs, + ) + except TypeError: + pytest.skip("This PEFT method does not support layers_to_transform, skipping it.") + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) - config = config_cls( - base_model_name_or_path=model_id, - layers_to_transform=[0], - **config_kwargs, - ) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) model = get_peft_model(model, config) @@ -1292,7 +1128,11 @@ def _test_training_layer_indexing(self, model_id, config_cls, config_kwargs): ) logits_from_pretrained = model_from_pretrained(**inputs)[0][0] - assert torch.allclose(logits, logits_from_pretrained, atol=1e-4, rtol=1e-4) + if config_cls == VBLoRAConfig: + atol, rtol = 1e-3, 1e-3 + else: + atol, rtol = 1e-4, 1e-4 + assert torch.allclose(logits, logits_from_pretrained, atol=atol, rtol=rtol) # check the nb of trainable params again but without layers_to_transform model = self.transformers_class.from_pretrained(model_id) @@ -1317,21 +1157,16 @@ def _test_training_layer_indexing(self, model_id, config_cls, config_kwargs): def _test_training_gradient_checkpointing(self, model_id, config_cls, config_kwargs): if config_cls == PrefixTuningConfig: - return pytest.skip(f"Test not applicable for {config_cls}") - + pytest.skip("Prefix Tuning does not support gradient checkpointing, skipping this test.") if (config_cls == AdaLoraConfig) and ("roberta" in model_id.lower()): # TODO: no gradients on the "dense" layer, other layers work, not sure why - self.skipTest("AdaLora with RoBERTa does not work correctly") - - if (config_cls == OFTConfig) and ("deberta" in model_id.lower()): - # TODO: no gradients on the "dense" layer, other layers work, not sure why - self.skipTest("OFT with Deberta does not work correctly") + pytest.skip("AdaLora with RoBERTa does not work correctly") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) if not getattr(model, "supports_gradient_checkpointing", False): - return pytest.skip(f"Model {model_id} does not support gradient checkpointing") + pytest.skip(f"Model {model_id} does not support gradient checkpointing") model.gradient_checkpointing_enable() @@ -1366,9 +1201,7 @@ def _test_training_gradient_checkpointing(self, model_id, config_cls, config_kwa assert param.grad is None def _test_peft_model_device_map(self, model_id, config_cls, config_kwargs): - if config_cls not in (LoraConfig, VBLoRAConfig): - return pytest.skip(f"Test not applicable for {config_cls}") - + _skip_if_conv1d_not_supported(model_id, config_cls, config_kwargs) config = config_cls( base_model_name_or_path=model_id, **config_kwargs, @@ -1390,7 +1223,7 @@ def _test_peft_model_device_map(self, 
model_id, config_cls, config_kwargs): def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwargs): if not issubclass(config_cls, PromptLearningConfig): - return pytest.skip(f"Test not applicable for {config_cls}") + pytest.skip(f"Test not applicable for {config_cls}") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1421,28 +1254,14 @@ def _test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwar assert param.grad is not None def _test_delete_adapter(self, model_id, config_cls, config_kwargs): - supported_peft_types = [ - PeftType.LORA, - PeftType.LOHA, - PeftType.LOKR, - PeftType.IA3, - PeftType.OFT, - PeftType.BOFT, - PeftType.VERA, - PeftType.FOURIERFT, - PeftType.HRA, - PeftType.VBLORA, - PeftType.BONE, - PeftType.MISS, - ] - # IA3 does not support deleting adapters yet, but it just needs to be added - # AdaLora does not support multiple adapters + if config_cls == AdaLoraConfig: + pytest.skip("AdaLoRA does not support multiple adapters") + _skip_if_deleting_adapter_not_supported(config_cls, config_kwargs) + config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) - if config.peft_type not in supported_peft_types: - return pytest.skip(f"Test not applicable for {config.peft_type}") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1495,28 +1314,14 @@ def _test_delete_adapter(self, model_id, config_cls, config_kwargs): model.base_model(**input) # should not raise an error def _test_delete_inactive_adapter(self, model_id, config_cls, config_kwargs): - # same as test_delete_adapter, but this time an inactive adapter is deleted - supported_peft_types = [ - PeftType.LORA, - PeftType.LOHA, - PeftType.LOKR, - PeftType.IA3, - PeftType.OFT, - PeftType.BOFT, - PeftType.FOURIERFT, - PeftType.HRA, - PeftType.VBLORA, - PeftType.BONE, - PeftType.MISS, - ] - # IA3 does not support deleting adapters yet, but it just needs to be added - # AdaLora does not support multiple adapters + if config_cls == AdaLoraConfig: + pytest.skip("AdaLoRA does not support multiple adapters") + _skip_if_deleting_adapter_not_supported(config_cls, config_kwargs) + config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) - if config.peft_type not in supported_peft_types: - return pytest.skip(f"Test not applicable for {config.peft_type}") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -1617,6 +1422,7 @@ def _test_unload_adapter(self, model_id, config_cls, config_kwargs): assert num_params_base == num_params_unloaded def _test_weighted_combination_of_adapters_lora(self, model, config, adapter_list, weight_list): + _skip_if_adding_weighted_adapters_not_supported(config) model.add_adapter(adapter_list[1], config) model.add_adapter(adapter_list[2], replace(config, r=20)) model = model.to(self.torch_device) @@ -1868,7 +1674,7 @@ def _test_weighted_combination_of_adapters(self, model_id, config_cls, config_kw def _test_disable_adapter(self, model_id, config_cls, config_kwargs): task_type = config_kwargs.get("task_type") if (task_type == "SEQ_2_SEQ_LM") and (config_cls in (PromptTuningConfig, PromptEncoderConfig)): - self.skipTest("Seq2Seq + prompt tuning/prompt encoder does not work with disabling adapters") + pytest.skip("Seq2Seq + prompt tuning/prompt encoder does not work with disabling adapters") def get_output(model): # helper function that works with different model types @@ -1947,19 +1753,17 @@ def get_output(model): 
# TODO: add tests to check if disabling adapters works after calling merge_adapter def _test_adding_multiple_adapters_with_bias_raises(self, model_id, config_cls, config_kwargs): - # When trying to add multiple adapters with bias in Lora, AdaLora or BOFTConfig, an error should be - # raised. Also, the peft model should not be left in a half-initialized state. - if not issubclass(config_cls, (LoraConfig, AdaLoraConfig, BOFTConfig)): - return pytest.skip(f"Test not applicable for {config_cls}") - - with hub_online_once(model_id): - config_kwargs = config_kwargs.copy() - config_kwargs["bias"] = "all" + config_kwargs = config_kwargs.copy() + config_kwargs["bias"] = "all" + try: config = config_cls( base_model_name_or_path=model_id, **config_kwargs, ) + except TypeError: + pytest.skip(f"{config_cls} does not support the 'bias' argument, skipping this test.") + with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) model = get_peft_model(model, config, "adapter0")
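Several of the hunks above (for example `_test_training_layer_indexing` and `_test_adding_multiple_adapters_with_bias_raises`) replace hard-coded lists of supported config classes with a probe: build the config with the extra argument and skip on `TypeError`. A hedged generic sketch of that pattern; `build_config_or_skip` is not part of the PR:

import pytest

def build_config_or_skip(config_cls, config_kwargs, **extra_kwargs):
    # Hypothetical generic form of the probe used in the last hunks: instead of
    # maintaining an allow-list of config classes, try to construct the config
    # with the extra argument (e.g. bias="all" or layers_to_transform=[0]) and
    # skip the test when the config class rejects it.
    try:
        return config_cls(**config_kwargs, **extra_kwargs)
    except TypeError:
        pytest.skip(f"{config_cls.__name__} does not support {sorted(extra_kwargs)}, skipping this test.")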