
Commit cb2f2ec

eaidova and echarlaix authored
Update llama and gemma patching for resolving bf16 execution issues with sin/cos (#783)
* update llama and gemma patching for resolving bf16 execution issues
* fix model patcher
* update tests
* Update tests/openvino/test_modeling.py (Co-authored-by: Ella Charlaix <[email protected]>)
* Update optimum/exporters/openvino/model_patcher.py (Co-authored-by: Ella Charlaix <[email protected]>)
* apply review comments
* Update optimum/exporters/openvino/model_patcher.py
* format
* Update optimum/exporters/openvino/model_patcher.py
* fix failing test
* Update tests/openvino/test_exporters_cli.py (Co-authored-by: Ella Charlaix <[email protected]>)

---------

Co-authored-by: Ella Charlaix <[email protected]>
1 parent 629a2e3 commit cb2f2ec

File tree

5 files changed, +56 -33 lines changed


optimum/exporters/openvino/model_configs.py

Lines changed: 1 addition & 2 deletions
@@ -49,7 +49,6 @@
     ChatGLMModelPatcher,
     CodeGenModelPatcher,
     DBRXModelPatcher,
-    GemmaModelPatcher,
     InternLM2Patcher,
     InternLMModelPatcher,
     JaisModelPatcher,
@@ -319,7 +318,7 @@ class GemmaOpenVINOConfig(GemmaOnnxConfig):
     def patch_model_for_export(
         self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
     ) -> "ModelPatcher":
-        return GemmaModelPatcher(self, model, model_kwargs=model_kwargs)
+        return LlamaModelPatcher(self, model, model_kwargs=model_kwargs)
 
 
 @register_in_tasks_manager(
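
Since the causal-mask and rotary-embedding workarounds are identical for the two architectures, the Gemma OpenVINO config now simply reuses LlamaModelPatcher. Roughly how the patcher gets applied during export, as a hedged sketch (the surrounding export call and variable names are assumptions, not part of this diff):

# Sketch only: `ov_config` stands for a GemmaOpenVINOConfig instance and `model` for the
# loaded transformers model; both names are placeholders for objects the exporter already holds.
patcher = ov_config.patch_model_for_export(model)   # now a LlamaModelPatcher for Gemma as well
with patcher:                                        # __enter__ swaps in the bf16-safe causal mask and sin/cos
    pass                                             # tracing/export would run here
# __exit__ restores the original _update_causal_mask and rotary forward methods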

optimum/exporters/openvino/model_patcher.py

Lines changed: 39 additions & 24 deletions
@@ -497,50 +497,65 @@ def _llama_gemma_update_causal_mask_latest(
 _llama_gemma_update_causal_mask = _llama_gemma_update_causal_mask_legacy
 
 
-class GemmaModelPatcher(DecoderModelPatcher):
+def llama_gemma_rotary_emb_forward(self, x, position_ids, seq_len=None):
+    # adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/gemma/modeling_gemma.py#L104
+    _seq_len = torch.max(position_ids) + 1 if seq_len is None else seq_len
+    if _seq_len > self.embed_positions.shape[0]:
+        if seq_len is None:
+            return self._orig_forward(x, position_ids)
+        else:
+            return self._orig_forward(x, position_ids, seq_len)
+    sincos = self.embed_positions[position_ids]
+    sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
+    return cos, sin
+
+
+class LlamaModelPatcher(DecoderModelPatcher):
     def __enter__(self):
         super().__enter__()
 
-        # gemma has some accuracy issues with bf16 with transformers >= 4.39
+        # llama/gemma has some accuracy issues with bf16 with transformers >= 4.39
         # fill causal mask in slightly different way for avoid overflow on some platforms
         if is_transformers_version(">=", "4.39.0"):
             self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
             self._model.model._update_causal_mask = types.MethodType(
                 _llama_gemma_update_causal_mask, self._model.model
             )
 
-        # init inv_freq for torchscript tracing
-        # https://github.com/huggingface/transformers/blob/ed74d97871468f3a4695ede50abdc0b55717a84d/src/transformers/models/gemma/modeling_gemma.py#L108
-        for layer in self._model.model.layers:
-            if layer.self_attn.rotary_emb.inv_freq is None:
-                rotary_emb = layer.self_attn.rotary_emb
-                layer.self_attn.rotary_emb.inv_freq = 1.0 / (
-                    rotary_emb.base ** (torch.arange(0, rotary_emb.dim, 2, dtype=torch.int64).float() / rotary_emb.dim)
-                )
+        max_positions = self._model.config.max_position_embeddings
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        super().__exit__(exc_type, exc_value, traceback)
-        if hasattr(self._model.model, "_orig_update_causal_mask"):
-            self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
+        # cos/sin for rotary position embeddings also having issues with bf16 and efficiency due to calculation on each step
+        # use precomputed
+        def create_sinusoidal_positions(num_pos: int, dim: int, base: int = 10000) -> torch.Tensor:
+            # adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L101
+            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
 
+            sinusoid_inp = torch.einsum(
+                "i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq
+            ).float()
+            emb = torch.cat((sinusoid_inp, sinusoid_inp), dim=-1)
+            return torch.cat((torch.sin(emb), torch.cos(emb)), dim=1)
 
-class LlamaModelPatcher(DecoderModelPatcher):
-    def __enter__(self):
-        super().__enter__()
+        base = self._model.model.layers[0].self_attn.rotary_emb.base
+        dim = self._model.model.layers[0].self_attn.rotary_emb.dim
+        embed_positions = create_sinusoidal_positions(max_positions, dim, base)
 
-        # llama has some accuracy issues with bf16 with transformers >= 4.39
-        # fill causal mask in slightly different way for avoid overflow on some platforms
-        if is_transformers_version(">=", "4.39.0"):
-            self._model.model._orig_update_causal_mask = self._model.model._update_causal_mask
-            self._model.model._update_causal_mask = types.MethodType(
-                _llama_gemma_update_causal_mask, self._model.model
-            )
+        for layer in self._model.model.layers:
+            layer.self_attn.rotary_emb.register_buffer("embed_positions", embed_positions)
+            layer.self_attn.rotary_emb._orig_forward = layer.self_attn.rotary_emb.forward
+
+            layer.self_attn.rotary_emb.forward = types.MethodType(
+                llama_gemma_rotary_emb_forward, layer.self_attn.rotary_emb
+            )
 
     def __exit__(self, exc_type, exc_value, traceback):
         super().__exit__(exc_type, exc_value, traceback)
         if hasattr(self._model.model, "_orig_update_causal_mask"):
             self._model.model._update_causal_mask = self._model.model._orig_update_causal_mask
 
+        for layer in self._model.model.layers:
+            layer.self_attn.rotary_emb.forward = layer.self_attn.rotary_emb._orig_forward
+
 
 SUPPORT_SDPA = is_torch_version(">", "2.1.0")
 
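The net effect: the sin/cos tables are built once in float32 and stored on each rotary embedding as an embed_positions buffer, so the traced graph only performs a gather and a split instead of recomputing sin/cos on every step (which is both slower and lossy under bf16). A small standalone check of that equivalence, with illustrative sizes rather than a real model config:

import torch

# Build the table exactly as create_sinusoidal_positions does above.
dim, max_pos, base = 64, 128, 10000  # illustrative sizes, not taken from any real checkpoint
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(max_pos, dtype=torch.int64).float(), inv_freq).float()
emb = torch.cat((sinusoid_inp, sinusoid_inp), dim=-1)
table = torch.cat((torch.sin(emb), torch.cos(emb)), dim=1)  # shape [max_pos, 2 * dim]

# Reference: the cos/sin a rotary embedding would compute on the fly in float32.
position_ids = torch.arange(16).unsqueeze(0)                # [1, 16]
freqs = position_ids[..., None].float() * inv_freq          # [1, 16, dim // 2]
ref = torch.cat((freqs, freqs), dim=-1)
cos_ref, sin_ref = ref.cos(), ref.sin()

# Patched path: one gather plus one split, no trig ops at trace/run time.
sincos = table[position_ids]
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
assert torch.allclose(cos, cos_ref, atol=1e-6) and torch.allclose(sin, sin_ref, atol=1e-6)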

tests/openvino/test_exporters_cli.py

Lines changed: 10 additions & 4 deletions
@@ -41,7 +41,7 @@
 )
 from optimum.intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS
 from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS
-from optimum.intel.utils.import_utils import is_openvino_tokenizers_available
+from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
 
 
 class OVCLIExportTestCase(unittest.TestCase):
@@ -90,20 +90,26 @@ class OVCLIExportTestCase(unittest.TestCase):
         ("text-generation-with-past", "opt125m", "int4_asym_g128", 4, 144),
         ("text-generation-with-past", "opt125m", "int4_sym_g64", 4, 144),
         ("text-generation-with-past", "opt125m", "int4_asym_g64", 4, 144),
-        ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32),
+        (
+            "text-generation-with-past",
+            "llama_awq",
+            "int4 --ratio 1.0 --sym --group-size 8 --all-layers",
+            0,
+            32 if is_transformers_version("<", "4.39.0") else 34,
+        ),
         (
             "text-generation-with-past",
            "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --awq --dataset wikitext2 --num-samples 100 "
             "--sensitivity-metric max_activation_variance",
-            4,
+            6 if is_transformers_version(">=", "4.39") else 4,
             28,
         ),
         (
             "text-generation-with-past",
             "llama_awq",
             "int4 --ratio 1.0 --sym --group-size 16 --scale-estimation --dataset wikitext2 --num-samples 100 ",
-            4,
+            6 if is_transformers_version(">=", "4.39") else 4,
             28,
         ),
     ]
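
Each option string above is what the test forwards to optimum-cli as the weight-format option; the expected node counts now depend on the transformers version, presumably because the new patcher exports the precomputed sin/cos tables as extra constants that also get compressed. A rough sketch of the equivalent CLI call (the checkpoint id and output directory are placeholders, and how the test actually shells out is an assumption):

import subprocess

# Hedged sketch: export a tiny llama checkpoint with the first updated option set.
cmd = (
    "optimum-cli export openvino --model <tiny-llama-awq-checkpoint> "
    "--task text-generation-with-past --weight-format int4 "
    "--ratio 1.0 --sym --group-size 8 --all-layers llama_int4_ov"
)
subprocess.run(cmd, shell=True, check=True)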

tests/openvino/test_modeling.py

Lines changed: 4 additions & 1 deletion
@@ -932,15 +932,18 @@ def test_beam_search(self, model_arch):
             do_sample=False,
             eos_token_id=None,
         )
+
         beam_sample_gen_config = GenerationConfig(
             max_new_tokens=10,
             min_new_tokens=10,
             num_beams=4,
             do_sample=True,
             eos_token_id=None,
-            top_k=1,
         )
 
+        if model_arch == "minicpm":
+            beam_sample_gen_config.top_k = 1
+
         group_beam_search_gen_config = GenerationConfig(
             max_new_tokens=10,
             min_new_tokens=10,

tests/openvino/test_quantization.py

Lines changed: 2 additions & 2 deletions
@@ -230,7 +230,7 @@ class OVWeightCompressionTest(unittest.TestCase):
                 quant_method=QuantizationMethod.AWQ,
                 scale_estimation=True,
             ),
-            16,
+            18 if is_transformers_version(">=", "4.39") else 16,
         ),
         (
             OVModelForCausalLM,
@@ -244,7 +244,7 @@
                 dataset="c4",
                 quant_method="awq",
             ),
-            16,
+            18 if is_transformers_version(">=", "4.39") else 16,
         ),
     )
