Commit f0ca962

calpt and TimoImhof authored
Fix default Lora/ (IA)^3 scaling in forward (#770)
Resolves the issue described in #760.

**IMPORTANT**: this fix restores weights compatibility with adapter-transformers. Compatibility with previous `adapters` versions is kept via a compat patch.

## Details

The current implementation of LoRA / (IA)^3 in `adapters` versions < 1.1.0 does not correctly implement adapter state scaling via the LoRA `alpha` attribute, effectively ignoring `alpha` and always applying a scaling of 1.0. This PR restores the correct original behavior as found in adapter-transformers' original LoRA implementation.

As this change breaks all adapters pre-trained with `adapters` versions 0.1.0 - 1.0.1, a backward compatibility patch is introduced that automatically sets `alpha = r` for LoRAs trained with affected versions. This ensures all previous adapters continue to behave exactly as trained (i.e. give the exact same output under newer versions).

---------

Co-authored-by: TimoImhof <[email protected]>
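For context, here is a minimal, self-contained sketch of the scaling arithmetic described above. It uses the standard LoRA formulation (scaling = alpha / r); the variable names are illustrative and not taken from the library:

```python
# Standard LoRA scales its low-rank update by alpha / r.
# The bug described above effectively dropped this factor, i.e. behaved as if the scaling were 1.0.
r, alpha = 8, 16

correct_scaling = alpha / r  # 2.0 -- what adapter-transformers and adapters >= 1.1.0 apply
buggy_scaling = 1.0          # what adapters 0.1.0 - 1.0.1 effectively applied

# The compat patch sets alpha = r for adapters trained with affected versions,
# so the restored formula reproduces the behavior they were trained with:
patched_alpha = r
assert patched_alpha / r == buggy_scaling
```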
1 parent 702381e · commit f0ca962

File tree

4 files changed (+26, -2 lines)


setup.py

Lines changed: 3 additions & 1 deletion
@@ -34,6 +34,7 @@
     "isort>=5.5.4",
     "Jinja2==2.11.3",
     "nltk",
+    "packaging",
     "parameterized",
     "pillow",
     "protobuf",
@@ -136,11 +137,12 @@ def deps_list(*pkgs):
 # when modifying the following list, make sure to update src/transformers/dependency_versions_check.py
 install_requires = [
     deps["transformers"],
+    deps["packaging"],
 ]

 setup(
     name="adapters",
-    version="1.0.1",
+    version="1.1.0.dev0",
     author="The AdapterHub team and community contributors",
     author_email="[email protected]",
     description="A Unified Library for Parameter-Efficient and Modular Transfer Learning",

src/adapters/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "1.0.1"
+__version__ = "1.1.0.dev0"

 from typing import TYPE_CHECKING

src/adapters/loading.py

Lines changed: 20 additions & 0 deletions
@@ -6,6 +6,7 @@
 from typing import Callable, Mapping, Optional, Sequence, Tuple

 import torch
+from packaging.version import Version


 try:
@@ -368,6 +369,23 @@ def _rename_legacy_weights(self, k):
             k = k.replace(old, new)
         return k

+    def _fix_backward_compat(self, config):
+        # Fix error in previous versions for LoRA/ (IA)^3
+        ADAPTER_PREFIX = "adapters."
+        MIN_VERSION = Version("1.1.0")
+
+        version = config.get("version", "")
+        if version.startswith(ADAPTER_PREFIX) and Version(version[len(ADAPTER_PREFIX) :]) < MIN_VERSION:
+            if (
+                config["config"].get("architecture", None) == "lora"
+                and config["config"]["r"] != config["config"]["alpha"]
+            ):
+                logger.warning(
+                    "Loading a LoRA trained using a faulty scaling implementation of a previous library version. Editing the configuration to make sure the adapter works as trained. "
+                    "See https://github.com/adapter-hub/adapters/pull/770 for more."
+                )
+                config["config"]["alpha"] = config["config"]["r"]
+
     # This method is used to remove unnecessary invertible adapters from task adapters using the old format.
     # In the old format, task adapters e.g. using seq_bn config specify inv. adapters but don't use them.
     # As inv. adapters would be incorrectly used in the new implementation,
@@ -560,6 +578,8 @@ def load(
             # The conversion to a set and then back to a list removes all duplicates
             leave_out = list(set(leave_out + config["config"]["leave_out"]))
             config["config"]["leave_out"] = leave_out
+        # Fix issues
+        self._fix_backward_compat(config)

         adapter_name = load_as or config["name"]
         # If the adapter is not part of the model, add it

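To make the effect of `_fix_backward_compat` concrete, here is a hypothetical, standalone sketch that mirrors the patch above on a hand-written config dict (the dict contents are invented for illustration; only the keys match the code):

```python
from packaging.version import Version

# Hypothetical config as it might be loaded for a LoRA trained with an affected release.
config = {
    "version": "adapters.1.0.1",
    "name": "my_lora",
    "config": {"architecture": "lora", "r": 8, "alpha": 16},
}

ADAPTER_PREFIX = "adapters."
MIN_VERSION = Version("1.1.0")

version = config.get("version", "")
if version.startswith(ADAPTER_PREFIX) and Version(version[len(ADAPTER_PREFIX):]) < MIN_VERSION:
    if config["config"].get("architecture") == "lora" and config["config"]["r"] != config["config"]["alpha"]:
        # Same rewrite as the compat patch: force alpha == r so the (now correct)
        # alpha / r scaling evaluates to 1.0, matching what the adapter was trained with.
        config["config"]["alpha"] = config["config"]["r"]

print(config["config"])  # {'architecture': 'lora', 'r': 8, 'alpha': 8}
```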
src/adapters/methods/lora.py

Lines changed: 2 additions & 0 deletions
@@ -100,6 +100,7 @@ def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tens
             hidden_states = hidden_states * gate
         else:
             gate = None
+        hidden_states = hidden_states * self.scaling

         return hidden_states, gate

@@ -171,6 +172,7 @@ def forward(self, hidden_states: Optional[torch.Tensor], layer_input: torch.Tens
             hidden_states = hidden_states * gate
         else:
             gate = None
+        hidden_states = hidden_states * self.scaling

         return hidden_states, gate

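The `self.scaling` factor applied above corresponds to the standard LoRA scaling (alpha / r). As a rough illustration of where that multiplication sits in a forward pass, here is a simplified stand-in module; it is not the library's actual LoRA class:

```python
import torch
import torch.nn as nn


class TinyLoRADelta(nn.Module):
    """Simplified sketch of a LoRA delta with alpha / r scaling applied in forward.

    Illustrative only; the adapters library's implementation differs in structure.
    """

    def __init__(self, in_features: int, out_features: int, r: int = 8, alpha: int = 16):
        super().__init__()
        self.lora_A = nn.Parameter(torch.randn(r, in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(out_features, r))
        # Standard LoRA scaling; the pre-1.1.0 bug effectively replaced this with 1.0.
        self.scaling = alpha / r

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Low-rank update (x A^T) B^T, scaled exactly once at the end of forward,
        # analogous to the fix above.
        delta = hidden_states @ self.lora_A.T @ self.lora_B.T
        return delta * self.scaling
```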