uxlfoundation · dyoussif · Feb 23, 2026 · dzarukin · Feb 24, 2026 · dzarukin
@@ -58,7 +58,7 @@ if (DNNL_ENABLE_PRIMITIVE_GPU_ISA STREQUAL "ALL")
 else()
     foreach(isa ${DNNL_ENABLE_PRIMITIVE_GPU_ISA})
         string(TOUPPER ${isa} uisa)
-        if(NOT "${uisa}" MATCHES "^(XELP|XEHP|XEHPG|XEHPC|XE2|XE3)$")
+        if(NOT "${uisa}" MATCHES "^(XELP|XEHP|XEHPG|XEHPC|XE2|XE3|XE3P)$")
             message(FATAL_ERROR "Unsupported primitive GPU ISA: ${uisa}")
         endif()
         set(BUILD_${uisa} TRUE)

@@ -151,7 +151,7 @@ set(DNNL_ENABLE_PRIMITIVE_GPU_ISA "ALL" CACHE STRING
     implementations will always be available. Valid values:
     - ALL (the default). Includes all ISA to be enabled.
     - <ISA_NAME>;<ISA_NAME>;... Includes only selected ISA to be enabled.
-      Possible values are: XELP, XEHP, XEHPG, XEHPC, XE2, XE3.")
+      Possible values are: XELP, XEHP, XEHPG, XEHPC, XE2, XE3, XE3P.")
 
 set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
     "Specifies an ISA set of GeMM kernels residing in x64/gemm folder to be

@@ -227,6 +227,7 @@
 #cmakedefine01 BUILD_XEHPC
 #cmakedefine01 BUILD_XE2
 #cmakedefine01 BUILD_XE3
+#cmakedefine01 BUILD_XE3P
 // GeMM kernels ISA controls
 #cmakedefine01 BUILD_GEMM_KERNELS_ALL
 #cmakedefine01 BUILD_GEMM_KERNELS_NONE

@@ -125,6 +125,12 @@ if(UNIX)
     endif()
 endif()
 
+# TODO: Remove these after the next pull-down from main.
+if(DNNL_WITH_XE3P)
+    add_definitions_with_host_compiler(-DDNNL_WITH_XE3P)
+    add_definitions_with_host_compiler(-DXE3P)
+endif()
+
 add_subdirectory(common)
 
 if(NOT DNNL_CPU_RUNTIME STREQUAL "NONE")

@@ -113,8 +113,10 @@ uint8_t float2e3m0(float f) {
             min_diff = diff;
             raw_bits = idx;
         }
-        // Special case for midpoint, we round to even (so even index)
-        if ((diff == min_diff) && !(idx & 1)) raw_bits = idx;
+        // Special case for midpoint:
+        //  - towards 0 for 0.125
+        //  - up for other ties
+        if ((diff == min_diff) && idx != 1) raw_bits = idx;
     }
     assert(raw_bits < 8);
     // reapply sign

@@ -239,4 +239,10 @@
 #define REG_XE3_ISA(...)
 #endif
 
+#if BUILD_PRIMITIVE_GPU_ISA_ALL || BUILD_XE3P
+#define REG_XE3P_ISA(...) __VA_ARGS__
+#else
+#define REG_XE3P_ISA(...)
+#endif
+
 #endif
@@ -19,6 +19,10 @@ file(GLOB SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
     )
 
+if(DNNL_WITH_XE3P)
+    add_definitions_with_host_compiler(-DXE3P=1)
+endif()
+
 set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu)
 add_library(${OBJ_LIB} OBJECT ${SOURCES})
 set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS

@@ -45,6 +45,9 @@ uint64_t get_future_extensions(
         case gpu_arch_t::xe2:
         case gpu_arch_t::xe_hpc:
         case gpu_arch_t::xe3:
+        case gpu_arch_t::xe3p_35_10:
+        case gpu_arch_t::xe3p_35_11:
+        case gpu_arch_t::xe3p_35_unknown:
             extensions |= (uint64_t)device_ext_t::intel_global_float_atomics;
             extensions
                     |= (uint64_t)device_ext_t::intel_variable_eu_thread_count;
@@ -109,7 +112,13 @@ bool device_info_t::mayiuse_sub_group(int size) const {
         case gpu_arch_t::xe_lp:
         case gpu_arch_t::xe_hp:
         case gpu_arch_t::xe_hpg: return utils::one_of(size, 8, 16, 32);
-        default: return utils::one_of(size, 16, 32);
+        case gpu_arch_t::xe_hpc:
+        case gpu_arch_t::xe2:
+        case gpu_arch_t::xe3:
+        case gpu_arch_t::xe3p_35_10:
+        case gpu_arch_t::xe3p_35_11:
+        case gpu_arch_t::xe3p_35_unknown: return utils::one_of(size, 16, 32);
+        default: return utils::one_of(size, 32);
     }
 }
 
@@ -145,6 +154,9 @@ int device_info_t::max_eus_per_wg(gpu_arch_t gpu_arch) {
     switch (gpu_arch) {
         case gpu::intel::compute::gpu_arch_t::xe_hpc:
         case gpu::intel::compute::gpu_arch_t::xe2:
+        case gpu_arch_t::xe3p_35_10:
+        case gpu_arch_t::xe3p_35_11:
+        case gpu_arch_t::xe3p_35_unknown:
         case gpu::intel::compute::gpu_arch_t::xe3: return 8;
         case gpu::intel::compute::gpu_arch_t::xe_lp:
         case gpu::intel::compute::gpu_arch_t::xe_hp:
@@ -158,6 +170,9 @@ int device_info_t::max_subgroup_size(gpu_arch_t gpu_arch) {
     switch (gpu_arch) {
         case gpu::intel::compute::gpu_arch_t::xe_hpc:
         case gpu::intel::compute::gpu_arch_t::xe2:
+        case gpu_arch_t::xe3p_35_10:
+        case gpu_arch_t::xe3p_35_11:
+        case gpu_arch_t::xe3p_35_unknown:
         case gpu::intel::compute::gpu_arch_t::xe3: return 32;
         case gpu::intel::compute::gpu_arch_t::xe_lp:
         case gpu::intel::compute::gpu_arch_t::xe_hp:
@@ -179,6 +194,9 @@ int device_info_t::min_subgroup_size() const {
         case gpu_arch_t::xe_hpg: return 8;
         case gpu_arch_t::xe_hpc:
         case gpu_arch_t::xe2:
+        case gpu_arch_t::xe3p_35_10:
+        case gpu_arch_t::xe3p_35_11:
+        case gpu_arch_t::xe3p_35_unknown:
         case gpu_arch_t::xe3: return 16;
         default: return 0;
     }
@@ -188,6 +206,9 @@ int device_info_t::max_exec_size(gpu_arch_t gpu_arch) {
     switch (gpu_arch) {
         case gpu::intel::compute::gpu_arch_t::xe_hpc:
         case gpu::intel::compute::gpu_arch_t::xe2:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
         case gpu::intel::compute::gpu_arch_t::xe3: return 128;
         default: return 64;
     }
@@ -221,6 +242,9 @@ int device_info_t::threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode) {
         case gpu::intel::compute::gpu_arch_t::xe_hpg:
         case gpu::intel::compute::gpu_arch_t::xe_hpc:
         case gpu::intel::compute::gpu_arch_t::xe2:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
         case gpu::intel::compute::gpu_arch_t::xe3:
             return large_grf_mode ? 4 : 8;
         case gpu::intel::compute::gpu_arch_t::unknown: return 7;
@@ -238,6 +262,11 @@ int device_info_t::max_slm_size(gpu_arch_t gpu_arch) {
         case gpu::intel::compute::gpu_arch_t::xe_hpg:
         case gpu::intel::compute::gpu_arch_t::xe_hpc:
         case gpu::intel::compute::gpu_arch_t::xe2:
         case gpu::intel::compute::gpu_arch_t::xe3: slm_size = (1 << 17); break;
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
+            slm_size = 3 * (1 << 17); // 384 KB
+            break;
         case gpu::intel::compute::gpu_arch_t::unknown: assert(!"not expected");
     }
@@ -269,6 +298,9 @@ size_t device_info_t::icache_size() const {
         case gpu::intel::compute::gpu_arch_t::xe_hpc: return 80 * 1024;
         case gpu::intel::compute::gpu_arch_t::xe2: return 96 * 1024;
         case gpu::intel::compute::gpu_arch_t::xe3: return 96 * 1024;
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
+        case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown: return 80 * 1024;
         case gpu::intel::compute::gpu_arch_t::unknown: assert(!"not expected");
     }
     return 0;

@@ -41,7 +41,18 @@ namespace gpu {
 namespace intel {
 namespace compute {
 
-enum class gpu_arch_t { unknown, xe_lp, xe_hp, xe_hpg, xe_hpc, xe2, xe3 };
+enum class gpu_arch_t {
+    unknown,
+    xe_lp,
+    xe_hp,
+    xe_hpg,
+    xe_hpc,
+    xe2,
+    xe3,
+    xe3p_35_10,
+    xe3p_35_11,
+    xe3p_35_unknown,
+};
 
 // Memory for storing ngen::Product to avoid directly including nGEN because of
 // header dependencies outside of src/gpu/intel.
@@ -58,6 +69,9 @@ static inline const char *to_string(gpu_arch_t arch) {
     CASE(xe_hpc);
     CASE(xe2);
     CASE(xe3);
+    CASE(xe3p_35_10);
+    CASE(xe3p_35_11);
+    CASE(xe3p_35_unknown);
     return "unknown";
 #undef CASE
 }
@@ -71,6 +85,9 @@ static inline gpu_arch_t str2gpu_arch(const char *str) {
     CASE(xe_hpc);
     CASE(xe2);
     CASE(xe3);
+    CASE(xe3p_35_10);
+    CASE(xe3p_35_11);
+    CASE(xe3p_35_unknown);
     return gpu_arch_t::unknown;
 #undef CASE
 }
@@ -253,6 +270,8 @@ struct device_info_t {
 
     bool has_native(data_type_t type) const;
 
+    bool is_efficient_64bit() const { return is_efficient_64bit_; }
+
     const std::vector<uint8_t> &get_cache_blob() const {
         return serialized_device_info_.get_data();
     }
@@ -282,6 +301,7 @@ struct device_info_t {
     bool mayiuse_systolic_ = false;
     bool mayiuse_ngen_kernels_ = false;
     bool mayiuse_system_memory_allocators_ = false;
+    bool is_efficient_64bit_ = false;
 
     std::string name_;
     xpu::runtime_version_t runtime_version_;

@@ -1094,6 +1094,8 @@ status_t init_vec_size(config_t &cfg) {
 
 int default_regs(const config_t &cfg) {
     if (!cfg.hw().large_grf_support()) return 128;
+    if (cfg.hw() == ngen::HW::XE3P_35_11 && cfg.is_dpas_or_dpasw_fma())
+        return 512;
     if (cfg.is_dpas_or_dpasw_fma()) return 256;
     return 128;
 }

@@ -251,6 +251,9 @@ class compute_builder_t {
             alloc_updater.update(buf_mgr_);
         }
 
+        // Assign {Fwd} for dpas when applicable.
+        if (cfg_.hw() >= ngen::HW::XE3P_35_10)
+            x2r_mul_stmt_ = inject_dpas_fwd(x2r_mul_stmt_);
         // Assign {Atomic} for dpas(w) when applicable.
         x2r_mul_stmt_ = inject_dpas_atomic(x2r_mul_stmt_);
     }

@@ -59,6 +59,9 @@ hw_t to_hw(ngen::HW hw) {
         case ngen::HW::XeHPC: return hw_t::xehpc;
         case ngen::HW::Xe2: return hw_t::xehpc;
         case ngen::HW::Xe3: return hw_t::xehpc;
+        case ngen::HW::XE3P_35_10:
+        case ngen::HW::XE3P_35_11:
+        case ngen::HW::XE3P_UNKNOWN: return hw_t::xehpc;
         default: gpu_error_not_expected() << "Unknown HW: " << to_string(hw);
     }
     return hw_t::undef;

@@ -1355,7 +1355,8 @@ struct fma_context_t {
         bool is_dpas = is_dp_fma(fma);
         bool is_a = (abc == abc_kind_t::a);
         auto type = (is_a ? a_type : b_type);
-        bool cvt_f16 = (layout.type().is_fp8() || layout.type().is_fp4());
+        bool cvt_f16 = ((hw < ngen::HW::XE3P_35_10 && layout.type().is_fp8())
+                || (hw < ngen::HW::XE3P_35_11 && layout.type().is_fp4()));
         int type_size = (cvt_f16 ? 2 : type.size());
         if (is_dpas) {
             int sdepth = 8;
@@ -2209,11 +2210,51 @@ class plan_builder_t {
         return plan_status_t::success;
     }
 
+    // Widens thr_view's inner block up to 256 contiguous bytes for prefetch.
+    // NOTE(review): full size always uses a_view() -- TODO confirm for B.
+    void maybe_extend_prefetch_thread_view_to_256_bytes(
+            view_t &thr_view) const {
+        auto thr_layout = thr_view.create_pseudo_vlayout();
+        auto &blocks = thr_layout.blocks();
+        if (blocks.size() <= 1) return;
+
+        auto &b0 = blocks[0];
+        auto &b1 = blocks[1];
+        if (!b1.stride.is_fixed() || !b0.stride.is_fixed()) return;
+        auto inner_var = thr_view.vvars()[b0.idx];
+        bool is_block_strided
+                = (b0.stride == stride_t(1)) && (b1.stride > b0.size);
+        int type_size = thr_layout.type().size();
+        dim_t full_dim_size
+                = gemm_schedule_.a_view().vdims()[b0.idx] * type_size;
+        bool size_ge_256b = (full_dim_size >= 256);
+        dim_t b0_size = b0.size * type_size;
+        bool prefetch_lt_256b = (b0_size < 256);
+        bool is_inner_loop = gemm_schedule_.is_inner_loop(inner_var);
+        // Extend only when all of the following hold:
+        // - b0 is unit-stride, under 256 bytes, and b1's stride exceeds b0.size
+        // - The original tensor has at least 256 bytes across b0 dimension
+        // - The inner block dimension corresponds to the inner loop
+        //   dimension. We want to prefetch extra cache lines only if they are
+        //   going to be used by the next iterations.
+        if (is_block_strided && size_ge_256b && prefetch_lt_256b
+                && is_inner_loop) {
+            gpu_assert(thr_view.vdims()[b0.idx] == b0.size);
+            int factor = 256 / b0_size; // integer division rounds down
+            thr_view.set_vdim(inner_var, b0.size * factor,
+                    thr_view.vstart()[b0.idx],
+                    /*overwrite=*/true);
+        }
+    }
+
     plan_status_t init_x_prefetch_plan(abc_kind_t abc, const view_t &tg_view,
             grid_info_t &grid, send_plan_t &prefetch) const {
         if (!use_prefetch(abc)) return plan_status_t::success;
         auto &tg = cfg_.thread_group_grid();
         auto thr_view = tg_view.split(tg, &grid);
+        if (cfg_.hw() == ngen::HW::XE3P_35_11) {
+            maybe_extend_prefetch_thread_view_to_256_bytes(thr_view);
+        }
         auto params = get_send_params(cfg_.options(), send_op_t::prefetch,
                 send_address_t::a64, fma_kind_t::undef, abc, thr_view,
                 gemm_schedule_);

@@ -195,6 +195,7 @@ struct gen_t : public primitive_t {
             // Check GPU architecture.
             bool arch_ok = utils::one_of(arch_, arch_t::xe_lp, arch_t::xe_hp,
                     arch_t::xe_hpg, arch_t::xe_hpc, arch_t::xe2, arch_t::xe3);
+            arch_ok |= (arch_ >= arch_t::xe3p_35_10);
 
             VDISPATCH_GEMM(arch_ok, VERBOSE_UNSUPPORTED_ARCH, "gpu");
             VDISPATCH_GEMM(IMPLICATION(with_binary, arch_ >= arch_t::xe_hp),
@@ -215,7 +216,7 @@ struct gen_t : public primitive_t {
                     || intel_engine->mayiuse(compute::device_ext_t::
                                     intel_subgroup_split_matrix_multiply_accumulate);
 
-            bool is_integrated = intel_engine->device_info()->is_integrated();
+            bool is_integrated = dev_info_->is_integrated();
 
             // Size checks for fused reduction kernels.
             if (with_sum_ab()) {
@@ -260,6 +261,9 @@ struct gen_t : public primitive_t {
                                    !with_eltwise && !with_binary),
                     VERBOSE_UNSUPPORTED_POSTOP);
 
+            if (arch_ >= arch_t::xe3p_35_10)
+                kernel_desc_.set_efficient_64b(dev_info_->is_efficient_64bit());
+
             bool print_verbose = get_verbose(verbose_t::debuginfo) >= 5;
             bool kernel_success = false;
             auto lda = ld(DNNL_ARG_A);

@@ -21,15 +21,16 @@ endif()
 
 # Use oneDNN names for ALL to ensure string replacement functions correctly
 set(GPUS ${DNNL_ENABLE_PRIMITIVE_GPU_ISA})
-string(REPLACE "ALL" "XELP;XEHP;XEHPG;XEHPC;XE2;XE3" GPUS "${GPUS}")
+string(REPLACE "ALL" "XELP;XEHP;XEHPG;XEHPC;XE2;XE3;XE3P" GPUS "${GPUS}")
 string(REPLACE "XELP" "12LP" GPUS "${GPUS}")
 string(REPLACE "XEHPG" "12p7" GPUS "${GPUS}")
 string(REPLACE "XEHPC" "12p8" GPUS "${GPUS}")
 string(REPLACE "XEHP" "12HP" GPUS "${GPUS}")
 string(REPLACE "XE2" "Xe2" GPUS "${GPUS}")
+string(REPLACE "XE3P" "Xe3P" GPUS "${GPUS}")
 string(REPLACE "XE3" "Xe3" GPUS "${GPUS}")
 
-set(ALL_GPUS "12LP;12HP;12p7;12p8;Xe2;Xe3")
+set(ALL_GPUS "12LP;12HP;12p7;12p8;Xe2;Xe3;Xe3P")
 foreach(GPU ${GPUS})
     if(NOT ${GPU} IN_LIST ALL_GPUS)
         message(FATAL_ERROR "Unknown GPU architecture: ${GPU}")
@@ -63,7 +64,7 @@ if(DPCPP_HOST_COMPILER_KIND STREQUAL "DEFAULT")
      )
 
     if (DNNL_ENABLE_PRIMITIVE_GPU_ISA STREQUAL "ALL")
-        set(DNNL_GPU_ISA_LIST "XELP;XEHP;XEHPG;XEHPC;XE2;XE3")
+        set(DNNL_GPU_ISA_LIST "XELP;XEHP;XEHPG;XEHPC;XE2;XE3;XE3P")
     else()
         foreach(isa ${DNNL_ENABLE_PRIMITIVE_GPU_ISA})
             string(TOUPPER ${isa} ISA)