
Commit 69b5ea5

Add IPEX patch fusion linear for bert and vit (#786)
* add bert for question answering
* fix check
* add ViT model for image classification
* fix tasks
* fix variable name
* add test patching
* add vit patching tests
* fix name
* skip testing patch if ipex < 2.3
* fix traced model patch check
1 parent 48cc82a commit 69b5ea5
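
For orientation: the fused path added by this commit is exercised through optimum-intel's IPEX model classes. A minimal usage sketch, assuming ipex >= 2.3.0 is installed and using the tiny test checkpoint referenced later in this diff (export=True goes through the ipex_jit_trace path changed below):

    import torch
    from transformers import AutoTokenizer
    from optimum.intel import IPEXModelForQuestionAnswering

    # export=True traces the model; on ipex >= 2.3.0 the patcher first swaps
    # BertIntermediate for the fused LinearGelu-based _IPEXIntermediate.
    model_id = "hf-internal-testing/tiny-random-bert"
    model = IPEXModelForQuestionAnswering.from_pretrained(model_id, export=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    inputs = tokenizer("This is a sample input", return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    print(outputs.start_logits.shape, outputs.end_logits.shape)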

File tree

4 files changed (+111, -38 lines)

optimum/exporters/ipex/model_patcher.py

Lines changed: 27 additions & 11 deletions
@@ -12,19 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from transformers.models.bert.modeling_bert import BertIntermediate
 from transformers.models.llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaForCausalLM,
     LlamaModel,
     LlamaRMSNorm,
 )
+from transformers.models.vit.modeling_vit import ViTIntermediate

 from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version

 from .modeling_utils import (
     _IPEX_MINIMUM_VERSION_FOR_PATCHING,
+    _ipex_rms_layer_norm_forward,
+    _IPEXIntermediate,
     _IPEXLlamaDecoderLayer,
-    _llama_layer_norm_forward,
     _llama_model_forward,
 )

@@ -33,8 +36,7 @@
 _TRANSFORMERS_MIN_VERSION = "4.39.0"
 _TRANSFORMERS_MAX_VERSION = "4.41.2"

-_IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",)
-_IPEX_EXPORTED_TASK = ("text-generation",)
+_IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",)


 def convert_func(m, func_name, new_function):
@@ -49,7 +51,7 @@ def convert_functions(m, target_m, new_function_name, new_function):
         convert_functions(sub_m, target_m, new_function_name, new_function)


-def convert_class(m, target_m, new_class, config):
+def convert_class(m, target_m, new_class, config=None):
     for name, sub_m in m.named_children():
         if isinstance(sub_m, target_m):
             new_m = new_class(sub_m, config)
@@ -65,6 +67,23 @@ def patch_op(m, target_m, new_op_name, new_op):


 def _patch_llama_model(model):
+    convert_functions(model, LlamaModel, "forward", _llama_model_forward)
+    convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward)
+    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
+    return model
+
+
+def _patch_bert_model(model):
+    convert_class(model, BertIntermediate, _IPEXIntermediate)
+    return model
+
+
+def _patch_vit_model(model):
+    convert_class(model, ViTIntermediate, _IPEXIntermediate)
+    return model
+
+
+def _patch_model(model):
     if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
         raise ImportError(f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports llama model patching")
     if is_transformers_version("<", _TRANSFORMERS_MIN_VERSION) or is_transformers_version(
@@ -73,13 +92,10 @@ def _patch_llama_model(model):
         raise ImportError(
             f"Only transformers versions {_TRANSFORMERS_MIN_VERSION} ~ {_TRANSFORMERS_MAX_VERSION} are verified."
         )
-    convert_functions(model, LlamaModel, "forward", _llama_model_forward)
-    convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward)
-    convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
-    return model
-
-
-def _patch_model(model):
     if isinstance(model, LlamaForCausalLM):
         model = _patch_llama_model(model)
+    elif model.config.model_type == "bert":
+        model = _patch_bert_model(model)
+    elif model.config.model_type == "vit":
+        model = _patch_vit_model(model)
     return model
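
The patching helpers above all follow one recursive pattern: walk named_children, swap matching submodules in place, then recurse. A self-contained sketch of that pattern with toy modules (ToyIntermediate and FusedToyIntermediate are illustrative names, not part of the repo):

    from torch import nn

    def convert_class(m, target_m, new_class, config=None):
        # Recursively replace every instance of target_m with new_class(sub_m, config).
        for name, sub_m in m.named_children():
            if isinstance(sub_m, target_m):
                setattr(m, name, new_class(sub_m, config))
            convert_class(sub_m, target_m, new_class, config)

    class ToyIntermediate(nn.Module):
        def __init__(self, hidden=8):
            super().__init__()
            self.dense = nn.Linear(hidden, hidden)

        def forward(self, x):
            return nn.functional.gelu(self.dense(x))

    class FusedToyIntermediate(nn.Module):
        # Stand-in for _IPEXIntermediate: reuses the original weights, new forward.
        def __init__(self, module, config=None):
            super().__init__()
            self.dense = module.dense

        def forward(self, x):
            return nn.functional.gelu(self.dense(x))

    model = nn.Sequential(ToyIntermediate(), nn.Sequential(ToyIntermediate()))
    convert_class(model, ToyIntermediate, FusedToyIntermediate)
    print(model)  # both ToyIntermediate blocks are now FusedToyIntermediate

Making config optional (config=None) is what lets _patch_bert_model and _patch_vit_model call convert_class without a config, while the llama path keeps passing model.config.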

optimum/exporters/ipex/modeling_utils.py

Lines changed: 31 additions & 11 deletions
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
 import math
 from typing import List, Optional, Tuple, Union

@@ -25,11 +26,27 @@
 from optimum.intel.utils.modeling_utils import _setattr_from_module


+logger = logging.getLogger(__name__)
+
 _IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0"


+if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
+    logger.warning(
+        f"Please upgrade the IPEX version to at least {_IPEX_MINIMUM_VERSION_FOR_PATCHING} if you want to patch the model."
+    )
+else:
+    from intel_extension_for_pytorch.llm.modules import (
+        IndirectAccessKVCacheAttention,
+        Linear2SiluMul,
+        LinearAdd,
+        LinearGelu,
+        RotaryEmbedding,
+    )
+
+
 # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83
-def _llama_layer_norm_forward(self, hidden_states):
+def _ipex_rms_layer_norm_forward(self, hidden_states):
     return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon)


@@ -139,14 +156,9 @@ def _llama_model_forward(
 # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L321
 class _IPEXLlamaAttention(nn.Module):
     def __init__(self, module, config) -> None:
-        if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
-            raise ImportError(
-                f"Only ipex version > {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports IndirectAccessKVCacheAttention, LinearAdd, RotaryEmbedding"
-            )
         super().__init__()
         _setattr_from_module(self, module)
         self.config = config
-        from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, LinearAdd, RotaryEmbedding

         if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]:
             self.mha_linear_add = LinearAdd(module.o_proj)
@@ -296,14 +308,9 @@ def forward(
 # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L186
 class _IPEXLlamaMLP(nn.Module):
     def __init__(self, module, config) -> None:
-        if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING):
-            raise ImportError(
-                f"Only ipex version > {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports Linear2SiluMul, LinearAdd"
-            )
         super().__init__()
         _setattr_from_module(self, module)
         self.config = config
-        from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd

         # LinearAllreduce and LinearLayer cannot use fused op LinearAdd
         if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]:
@@ -398,3 +405,16 @@ def forward(
             outputs += (present_key_value,)

         return outputs
+
+
+# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/bert/modeling_bert.py#L524
+class _IPEXIntermediate(nn.Module):
+    def __init__(self, module, config):
+        super().__init__()
+        _setattr_from_module(self, module)
+        self.linear_gelu = LinearGelu(module.dense)
+        del self.__dict__["_modules"]["dense"]
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.linear_gelu(hidden_states)
+        return hidden_states
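
For readers without ipex at hand: LinearGelu(module.dense) wraps the existing nn.Linear and computes the projection plus GELU activation in a single fused kernel, and the del self.__dict__["_modules"]["dense"] line then drops the now-redundant unfused submodule (its weights are carried by the fused module). A plain-PyTorch reference for what _IPEXIntermediate.forward computes; this is a sketch of the unfused math, not the ipex implementation:

    import torch
    from torch import nn

    class ReferenceIntermediate(nn.Module):
        """Unfused equivalent of _IPEXIntermediate: dense projection, then GELU."""

        def __init__(self, dense: nn.Linear):
            super().__init__()
            self.dense = dense

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            # ipex's LinearGelu performs these two steps in one fused op;
            # the tests below accept up to 1e-4 absolute deviation from this.
            return nn.functional.gelu(self.dense(hidden_states))

    x = torch.randn(2, 4, 16)
    ref = ReferenceIntermediate(nn.Linear(16, 64))
    print(ref(x).shape)  # torch.Size([2, 4, 64])

Moving the fused-module imports to a single guarded top-level block also replaces the per-class version checks that each __init__ used to perform.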

optimum/intel/ipex/modeling_base.py

Lines changed: 23 additions & 16 deletions
@@ -51,7 +51,11 @@
 from optimum.modeling_base import OptimizedModel
 from optimum.utils import NormalizedConfigManager

-from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _IPEX_MINIMUM_VERSION_FOR_PATCHING, _patch_model
+from ...exporters.ipex.model_patcher import (
+    _IPEX_EXPORTED_GENERATION_TASKS,
+    _IPEX_MINIMUM_VERSION_FOR_PATCHING,
+    _patch_model,
+)
 from ..generation.modeling import prepare_jit_inputs
 from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, recursive_to_device
@@ -60,7 +64,7 @@
 logger = logging.getLogger(__name__)


-_IPEX_SUPPORT_MODEL_TYPES = ("llama",)
+_IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit")
 _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation")


@@ -70,17 +74,22 @@ def _is_patched_with_ipex(model, task):

     if isinstance(model, torch.jit.ScriptModule):
         for node in model.graph.nodes():
-            # Jit will record the codes position so we can check if the node use ipex exporter.
-            if "torch_ipex::rotary_position_embedding" in node.__str__():
+            # Only patched models enable fused linear ops.
+            if "/fusions/" in node.__str__():
                 return True
         return False
-    else:
+    elif task in _IPEX_EXPORTED_GENERATION_TASKS and model.config.hidden_size < 64:
         # The ipex IAKV op in patched model requires the hidden size at least 64
-        return (
-            model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES
-            and task in _IPEX_EXPORTED_TASK
-            and model.config.hidden_size >= 64
-        )
+        return False
+
+    return model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES
+
+
+def _prepare_inputs_for_ipex_model(model, task, use_cache):
+    if task in _IPEX_EXPORTED_GENERATION_TASKS and _is_patched_with_ipex(model, task):
+        return get_dummy_input(model, return_dict=True)
+    else:
+        return prepare_jit_inputs(model, task, use_cache)


 def ipex_jit_trace(model, task, use_cache):
@@ -90,12 +99,8 @@ def ipex_jit_trace(model, task, use_cache):

     if _is_patched_with_ipex(model, task):
         model = _patch_model(model)
-        # TODO: integerate in prepare_jit_inputs.
-        sample_inputs = get_dummy_input(model, return_dict=True)
-        # Use Tensor Processing Primitives to accelerate linear, see https://arxiv.org/abs/2104.05755.
-        _enable_tpp()
-    else:
-        sample_inputs = prepare_jit_inputs(model, task, use_cache)
+
+    sample_inputs = _prepare_inputs_for_ipex_model(model, task, use_cache)

     model.config.return_dict = False

@@ -104,6 +109,8 @@ def ipex_jit_trace(model, task, use_cache):
     if not use_cache:
         sample_inputs.pop("past_key_values")

+    # Use Tensor Processing Primitives to accelerate linear, see https://arxiv.org/abs/2104.05755.
+    _enable_tpp()
     model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True)
     # Disable repack while jit tracing to reduce the memory
     ipex._C.disable_jit_linear_repack()
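
The reworked _is_patched_with_ipex relies on TorchScript recording the source location of the Python code that emitted each graph node: modules patched with ipex fusions produce nodes whose recorded paths contain "/fusions/", whereas the previous torch_ipex::rotary_position_embedding probe only fired for patched llama models. Non-generation models also no longer hit the hidden-size guard, which is only needed for the IAKV generation op. A minimal sketch of the detection idea, assuming a traced module is at hand:

    import torch

    def is_traced_model_patched(traced: torch.jit.ScriptModule) -> bool:
        # Each node's string form includes the source file of the code that
        # produced it; only ipex fused modules contribute "/fusions/" paths.
        for node in traced.graph.nodes():
            if "/fusions/" in str(node):
                return True
        return False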

tests/ipex/test_modeling.py

Lines changed: 30 additions & 0 deletions
@@ -188,6 +188,21 @@ def test_pipeline(self, model_arch):
         self.assertGreaterEqual(outputs["score"], 0.0)
         self.assertIsInstance(outputs["answer"], str)

+    @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version >= 2.3.0 supports ipex model patching")
+    def test_patched_model(self):
+        ipex_model = IPEXModelForQuestionAnswering.from_pretrained(
+            "Jiqing/patched_tiny_random_bert_for_question_answering"
+        )
+        transformers_model = AutoModelForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-bert")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
+        inputs = "This is a sample input"
+        tokens = tokenizer(inputs, return_tensors="pt")
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**tokens)
+        outputs = ipex_model(**tokens)
+        self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4))
+        self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4))
+

 class IPEXModelForCausalLMTest(unittest.TestCase):
     IPEX_MODEL_CLASS = IPEXModelForCausalLM
@@ -458,3 +473,18 @@ def test_pipeline(self, model_arch):
         self.assertEqual(pipe.device, model.device)
         self.assertGreaterEqual(outputs[0]["score"], 0.0)
         self.assertTrue(isinstance(outputs[0]["label"], str))
+
+    @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version >= 2.3.0 supports ipex model patching")
+    def test_patched_model(self):
+        ipex_model = IPEXModelForImageClassification.from_pretrained(
+            "Jiqing/patched_tiny_random_vit_for_image_classification"
+        )
+        transformers_model = self.IPEX_MODEL_CLASS.from_pretrained("hf-internal-testing/tiny-random-vit")
+        preprocessor = AutoFeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-vit")
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+        inputs = preprocessor(images=image, return_tensors="pt")
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**inputs)
+        outputs = ipex_model(**inputs)
+        self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4))
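
Both new tests load a pre-patched, pre-traced checkpoint, compare its logits against the eager transformers baseline within atol=1e-4, and self-skip on ipex < 2.3.0. A hypothetical way to run just these cases from a checkout (assumes pytest is installed; -k is the standard pytest name filter):

    import pytest

    # Select only the two test_patched_model cases added in this commit.
    raise SystemExit(pytest.main(["tests/ipex/test_modeling.py", "-k", "test_patched_model", "-q"]))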
