@@ -32,7 +32,7 @@

 logger = logging.getLogger(__name__)

-_IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.4.0"
+_IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.6.0"
 _accelerate_added_attributes = ["to", "xpu"]


@@ -52,92 +52,6 @@
 )


-# TODO: Following XPULinearXXX op classes will be put into ipex after 2.6.0 version
-class XPULinear2SiluMul(torch.nn.Module):
-    def __init__(
-        self,
-        gate_proj: torch.nn.Module,
-        up_proj: torch.nn.Module,
-    ):
-        super().__init__()
-        self.gate_proj_weight = gate_proj.weight.transpose(0, 1).contiguous()
-        self.up_proj_weight = up_proj.weight.transpose(0, 1).contiguous()
-        self.gate_proj_bias = gate_proj.bias
-        self.up_proj_bias = up_proj.bias
-
-    def forward(
-        self,
-        hidden_states,
-    ):
-        up = torch.ops.torch_ipex.mm_silu(hidden_states, self.gate_proj_weight)
-        if self.gate_proj_bias is not None:
-            up += self.gate_proj_bias
-        hidden_states = torch.ops.torch_ipex.mm_resmul(hidden_states, self.up_proj_weight, up)
-        if self.up_proj_bias is not None:
-            hidden_states += self.up_proj_bias
-        return hidden_states
-
-
-class XPULinearGelu(torch.nn.Module):
-    def __init__(self, module: torch.nn.Module):
-        super().__init__()
-        self.weight = module.weight.transpose(0, 1).contiguous()
-        self.bias = module.bias
-
-    def forward(self, x):
-        return torch.ops.torch_ipex.matmul_gelu(x, self.weight, self.bias, 1.0, "tanh")
-
-
-class XPULinearAdd(torch.nn.Module):
-    def __init__(
-        self,
-        module: torch.nn.Module,
-    ):
-        super().__init__()
-        self.weight = module.weight.transpose(0, 1).contiguous()
-        self.bias = module.bias
-
-    def forward(
-        self,
-        hidden_states,
-        residual,
-    ):
-        token_len, _ = hidden_states.size()
-        if residual is None:
-            hidden_states = torch.matmul(hidden_states, self.weight)
-            if self.bias is not None:
-                hidden_states += self.bias
-        else:
-            if self.bias is not None:
-                hidden_states = torch.ops.torch_ipex.mm_bias_resadd(
-                    hidden_states, self.weight, self.bias, 1.0, residual, 1.0
-                )
-            else:
-                hidden_states = torch.addmm(
-                    residual.flatten(0, -2),
-                    hidden_states.flatten(0, -2),
-                    self.weight,
-                    beta=1.0,
-                )
-        hidden_states = hidden_states.view(token_len, -1)
-        return hidden_states
-
-
-class XPUlinearAddAdd(torch.nn.Module):
-    def __init__(self, module: torch.nn.Module):
-        super().__init__()
-        self.weight = module.weight.transpose(0, 1).contiguous()
-        self.bias = module.bias
-
-    def forward(self, x, y, z):
-        if self.bias is not None:
-            x = torch.ops.torch_ipex.mm_bias_resadd(x, self.weight, self.bias, 1.0, y, 1.0)
-            x += z
-        else:
-            x = torch.ops.torch_ipex.mm_bias_resadd(x, self.weight, z, 1.0, y, 1.0)
-        return x
-
-
 # Adapted from https://github.com/huggingface/accelerate/blob/v1.2.1/src/accelerate/hooks.py#L183
 def _remove_hooks_for_ipex(module, recurse):
     if hasattr(module, "_hf_hook"):
@@ -885,11 +799,9 @@ def __init__(self, module, device, config) -> None:
         self.q_slice = self.q_proj.weight.shape[0]
         self.k_slice = self.q_slice + self.k_proj.weight.shape[0]
         self.v_slice = self.k_slice + self.v_proj.weight.shape[0]
+
         if not config.compile and module.o_proj.__class__.__name__ not in ["LinearAllreduce"]:
-            if self.module_device.type == "cpu":
-                self.mha_linear_add = LinearAdd(module.o_proj)
-            elif self.module_device.type == "xpu":
-                self.mha_linear_add = XPULinearAdd(module.o_proj)
+            self.mha_linear_add = LinearAdd(module.o_proj)

     def qkv_gemm(self, hidden_states):
         if hasattr(self, "concat_qkv"):
@@ -940,13 +852,8 @@ def __init__(self, module, device, config) -> None:
             self.c_proj_linear = nn.Linear(self.c_proj.weight.shape[0], self.c_proj.weight.shape[1])
             self.c_proj_linear.weight = nn.Parameter(self.c_proj.weight.t())
             self.c_proj_linear.bias = self.c_proj.bias
-            if self.module_device.type == "cpu":
-                if self.c_proj_linear not in ["LinearAllreduce"]:
-                    self.linear_add = LinearAdd(self.c_proj_linear)
-
-            elif self.module_device.type == "xpu":
-                if self.c_proj_linear not in ["LinearAllreduce"]:
-                    self.linear_add = XPULinearAdd(self.c_proj_linear)
+            if self.c_proj_linear not in ["LinearAllreduce"]:
+                self.linear_add = LinearAdd(self.c_proj_linear)

     def qkv_gemm(self, hidden_states):
         if hasattr(self, "c_attn_linear"):
@@ -977,17 +884,12 @@ def __init__(self, module, device, config) -> None:
         _setattr_from_module(self, module)
         self.config = config
         self.module_device = device
+
         if not config.compile and getattr(config, "quantization_config", None) is None:
-            if self.module_device.type == "cpu":
-                # LinearAllreduce and LinearLayer cannot use fused op LinearAdd
-                if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]:
-                    self.mlp_linear_add = LinearAdd(module.down_proj)
-                self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj)
-            elif self.module_device.type == "xpu":
-                # LinearAllreduce and LinearLayer cannot use fused op LinearAdd
-                if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]:
-                    self.mlp_linear_add = XPULinearAdd(module.down_proj)
-                self.linear_silu_mul = XPULinear2SiluMul(module.gate_proj, module.up_proj)
+            # LinearAllreduce and LinearLayer cannot use fused op LinearAdd
+            if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]:
+                self.mlp_linear_add = LinearAdd(module.down_proj)
+            self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj)

     def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, **kwargs):
         if hasattr(self, "linear_silu_mul"):
@@ -1012,15 +914,10 @@ def __init__(self, module, device, config) -> None:
         self.module_device = device
         if not config.compile and getattr(config, "quantization_config", None) is None:
             # LinearAllreduce and LinearLayer cannot use fused op LinearAdd
-            if self.module_device.type == "cpu":
-                self.linear_gelu = LinearGelu(module.dense_h_to_4h)
-            elif self.module_device.type == "xpu":
-                self.linear_gelu = XPULinearGelu(module.dense_h_to_4h)
+            self.linear_gelu = LinearGelu(module.dense_h_to_4h)
+
             if module.dense_4h_to_h.__class__.__name__ not in ["LinearAllreduce"]:
-                if self.module_device.type == "cpu":
-                    self.linear_add_add = LinearAddAdd(module.dense_4h_to_h)
-                elif self.module_device.type == "xpu":
-                    self.linear_add_add = XPUlinearAddAdd(module.dense_4h_to_h)
+                self.linear_add_add = LinearAddAdd(module.dense_4h_to_h)

     def forward(
         self,
@@ -1059,11 +956,9 @@ def __init__(self, module, device, config) -> None:
             self.c_proj_linear.bias = self.c_proj.bias
             if self.module_device.type == "cpu":
                 self.linear_new_gelu = LinearNewGelu(self.c_fc_linear)
-                if self.c_proj_linear not in ["LinearAllreduce"]:
-                    self.linear_add = LinearAdd(self.c_proj_linear)
-            elif self.module_device.type == "xpu":
-                if self.c_proj_linear not in ["LinearAllreduce"]:
-                    self.linear_add = XPULinearAdd(self.c_proj_linear)
+
+            if self.c_proj_linear not in ["LinearAllreduce"]:
+                self.linear_add = LinearAdd(self.c_proj_linear)

     def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
         if hasattr(self, "linear_new_gelu"):
@@ -1232,11 +1127,9 @@ def __init__(self, module, device, config):
         super().__init__()
         _setattr_from_module(self, module)
         self.module_device = device
+
         if not config.compile and getattr(config, "quantization_config", None) is None:
-            if self.module_device.type == "cpu":
-                self.linear_gelu = LinearGelu(module.dense)
-            elif self.module_device.type == "xpu":
-                self.linear_gelu = XPULinearGelu(module.dense)
+            self.linear_gelu = LinearGelu(module.dense)

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if hasattr(self, "linear_gelu"):
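
For reference, below the diff: a minimal sketch (not from this PR) of the linear + residual-add semantics that both the removed `XPULinearAdd` and the retained `LinearAdd` wrappers fuse into a single kernel. The `LinearAddSketch` name and the plain `nn.Linear` forward are illustrative assumptions; the real classes transpose the weight once and call IPEX ops such as `torch.ops.torch_ipex.mm_bias_resadd` instead of composing separate PyTorch ops.

```python
import torch
import torch.nn as nn


class LinearAddSketch(nn.Module):
    """Hypothetical reference module: out = Linear(x) (+ residual), i.e. the math
    the fused mm_bias_resadd / addmm paths compute in one call."""

    def __init__(self, linear: nn.Linear):
        super().__init__()
        self.linear = linear

    def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None) -> torch.Tensor:
        out = self.linear(hidden_states)
        return out if residual is None else out + residual


# Usage: wrap an existing projection layer and pass the residual alongside the activations.
proj = nn.Linear(64, 64)
fused = LinearAddSketch(proj)
x, res = torch.randn(4, 64), torch.randn(4, 64)
assert torch.allclose(fused(x, res), proj(x) + res)
```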