
Commit 7d4f297

fengyuan14 and Zhu, Yuhua authored
Bypass XeTLA implementation on platforms without XMX and add error handling for LLM (#3841)

* update has_xmx only for xpu device
* qkv_gemm: bypass XeTLA implementation on platforms without XMX equipped
* add torch.xpu.has_xmx() api and error handling for LLM
* Fix flake8

---------

Signed-off-by: Zhu, Yuhua <[email protected]>
Signed-off-by: Feng Yuan <[email protected]>
Co-authored-by: Zhu, Yuhua <[email protected]>
1 parent b06fc38 commit 7d4f297
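
As a quick orientation (not part of the commit itself), here is a minimal sketch of how the new capability check might be used from Python. It assumes intel_extension_for_pytorch is installed and that, as the commit message states, it registers torch.xpu.has_xmx(); the messages printed are illustrative only.

# Minimal sketch, assuming intel_extension_for_pytorch registers
# torch.xpu.has_xmx() as described in the commit message above.
import torch
import intel_extension_for_pytorch as ipex  # noqa: F401

if torch.xpu.is_available() and torch.xpu.has_xmx():
    # XMX present: XeTLA-backed kernels such as the fused QKV GEMM are eligible.
    print("XMX is supported; XeTLA paths can be used")
else:
    # No XMX (e.g. PVC1550vg, per the comments added in optimize.py below):
    # the XeTLA implementation is bypassed and optimize_transformers falls back.
    print("XMX is not supported; XeTLA paths are bypassed")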

File tree

6 files changed: +67 -12 lines changed

csrc/gpu/aten/operators/XeGemm.cpp

Lines changed: 2 additions & 1 deletion
@@ -2,6 +2,7 @@
 #include <ATen/ATen.h>
 #include <ATen/CPUApplyUtils.h>
 #include <ATen/record_function.h>
+#include <runtime/Device.h>
 #include <runtime/Utils.h>
 #include <iostream>
 #include "Linear.h"
@@ -465,7 +466,7 @@ static void mm_qkv_out(
       out1_valid && out2_valid && input_valid && weight_valid && bias_valid &&
       shape_valid;

-  if (use_xetla) {
+  if (dpcppGetDeviceHasXMX() && use_xetla) {
     char str__[100];
     if (!has_bias) {
       sprintf(str__, "hgemm_qkv(%d, %d, %d)", m, n, k);

csrc/gpu/utils/Settings.cpp

Lines changed: 5 additions & 0 deletions
@@ -175,6 +175,11 @@ bool Settings::has_atomic64(int device_id) {
   return dpcppGetDeviceProperties(device_id)->support_atomic64;
 }

+bool Settings::has_xmx(int device_id) {
+  // whether XMX is supported in the specified platform.
+  return dpcppGetDeviceHasXMX(device_id);
+}
+
 int Settings::get_verbose_level() const {
   std::lock_guard<std::mutex> lock(s_mutex);
   return static_cast<int>(verbose_level);

csrc/gpu/utils/Settings.h

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ class IPEX_API Settings final {
   bool has_fp64_dtype(int device_id = -1);
   bool has_2d_block_array(int device_id = -1);
   bool has_atomic64(int device_id = -1);
+  bool has_xmx(int device_id = -1);

   static Settings& I(); // Singleton

intel_extension_for_pytorch/csrc/xpu/Module.cpp

Lines changed: 2 additions & 0 deletions
@@ -663,6 +663,8 @@ void init_xpu_module(pybind11::module& m) {
     return Settings::I().has_2d_block_array(device);
   });

+  m.def("_has_xmx", [](int device) { return Settings::I().has_xmx(device); });
+
   m.def(
       "_get_verbose_level", []() { return Settings::I().get_verbose_level(); });
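
The binding above exposes Settings::I().has_xmx(device) to Python as ipex._C._has_xmx(device). A small sketch of how it can be combined with the existing _has_2d_block_array binding to classify the platform, mirroring the checks added to optimize.py below; the tier labels are illustrative, only the two private bindings and the device examples come from this commit.

# Sketch of the platform classification used by optimize.py below.
# Tier labels are illustrative; the bindings and device examples are from the diff.
import intel_extension_for_pytorch as ipex

has_xmx = ipex._C._has_xmx(0)                  # new binding from this commit
has_2d_load = ipex._C._has_2d_block_array(0)   # existing binding

if has_xmx and has_2d_load:
    tier = "XMX + 2D load (e.g. PVC1100, PVC1100c, PVC1550)"
elif has_xmx:
    tier = "XMX without 2D load (e.g. ATS-M, ARC)"
else:
    tier = "no XMX (e.g. PVC1550vg): ipex.optimize_transformers is not supported"
print(tier)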

intel_extension_for_pytorch/transformers/optimize.py

Lines changed: 52 additions & 11 deletions
@@ -586,17 +586,56 @@ def optimize_transformers(
         or re.search("falcon", model.config.architectures[0], re.IGNORECASE)
         or re.search("rw", model.config.architectures[0], re.IGNORECASE)
     ) and device == "cpu"
-    # bypass_ref_model = (re.search("Bloom", model.config.architectures[0], re.IGNORECASE)) or device == "xpu"
-    xpu_supported_model = (
-        re.search("GPTJ", model.config.architectures[0], re.IGNORECASE)
-        or re.search("llama", model.config.architectures[0], re.IGNORECASE)
-        or re.search("OPT", model.config.architectures[0], re.IGNORECASE)
-        or re.search("Bloom", model.config.architectures[0], re.IGNORECASE)
-    ) and device == "xpu"
-    if not (well_supported_model or xpu_supported_model):
+
+    # If the XPU platform does not have XMX, such as PVC1550vg, ipex.optimize_transformers is not supported.
+    # If the XPU platform has XMX and 2D load instructions, such as PVC1100, PVC1100c, and PVC1550,
+    # ipex.optimize_transformers supports GPT-J, Llama, OPT, Bloom, Falcon, QWen.
+    xpu_2d_load_supported_model = (
+        (
+            re.search("GPTJ", model.config.architectures[0], re.IGNORECASE)
+            or re.search("llama", model.config.architectures[0], re.IGNORECASE)
+            or re.search("OPT", model.config.architectures[0], re.IGNORECASE)
+            or re.search("Bloom", model.config.architectures[0], re.IGNORECASE)
+            or re.search("Falcon", model.config.architectures[0], re.IGNORECASE)
+            or re.search("QWen", model.config.architectures[0], re.IGNORECASE)
+            or re.search("Baichuan", model.config.architectures[0], re.IGNORECASE)
+        )
+        and device == "xpu"
+        and ipex._C._has_2d_block_array(0)
+        and ipex._C._has_xmx(0)
+    )
+
+    # If the XPU platform has XMX but no 2D load instructions, such as ATS-M and ARC,
+    # ipex.optimize_transformers supports GPT-J, Llama, QWen.
+    xpu_non_2d_load_supported_model = (
+        (
+            re.search("GPTJ", model.config.architectures[0], re.IGNORECASE)
+            or re.search("llama", model.config.architectures[0], re.IGNORECASE)
+            or re.search("QWen", model.config.architectures[0], re.IGNORECASE)
+        )
+        and device == "xpu"
+        and not ipex._C._has_2d_block_array(0)
+        and ipex._C._has_xmx(0)
+    )
+
+    if not (
+        well_supported_model
+        or xpu_2d_load_supported_model
+        or xpu_non_2d_load_supported_model
+    ):
         warnings.warn(
-            "optimize_transformers supports GPT-J/Llama/OPT/Bloom in XPU and Llama/GPT-J/GPT-Neox/Falcon/OPT"
-            " in CPU, fallback to origin model"
+            "The compatibility of ipex.optimize_transformers depends on the CPU/XPU platform "
+            " and the transformer model. Here are the general rules: "
+            " If the XPU platform does not have XMX, such as PVC1550vg, "
+            " ipex.optimize_transformers is not supported. "
+            " If the XPU platform has XMX and 2D load instructions, such as PVC1100, PVC1100c, and PVC1550,"
+            " ipex.optimize_transformers supports GPT-J/Llama/OPT/Bloom/Falcon/QWen, "
+            " and BasicTransformerBlock of diffusers. "
+            " If the XPU platform has XMX but no 2D load instructions, such as ATS-M and ARC, "
+            " ipex.optimize_transformers supports GPT-J/Llama/QWen, "
+            " and BasicTransformerBlock of diffusers. "
+            " If the platform is CPU, "
+            " ipex.optimize_transformers supports Llama, GPT-J, GPT-Neox, Falcon, and OPT."
         )
         return model

@@ -655,7 +694,9 @@ def optimize_transformers(
         xpu_woq = True

     # model reference conversion
-    if not (xpu_supported_model or xpu_woq):
+    if not (
+        xpu_2d_load_supported_model or xpu_non_2d_load_supported_model or xpu_woq
+    ):
         _model = model_convert_reference(_model)

     # model quantization if needed

intel_extension_for_pytorch/xpu/utils.py

Lines changed: 5 additions & 0 deletions
@@ -88,6 +88,11 @@ def has_2d_block_array(device: int = -1) -> bool:
     return _C._has_2d_block_array(device)


+def has_xmx(device: int = -1) -> bool:
+    r"""Returns a bool indicating if the platform supports xmx"""
+    return _C._has_xmx(device)
+
+
 # Basic OnOff
 class OnOff:
     def __init__(self, checker, enable, disable):
