Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/configuring_primitive_list.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ if (DNNL_ENABLE_PRIMITIVE_GPU_ISA STREQUAL "ALL")
else()
foreach(isa ${DNNL_ENABLE_PRIMITIVE_GPU_ISA})
string(TOUPPER ${isa} uisa)
if(NOT "${uisa}" MATCHES "^(XELP|XEHP|XEHPG|XEHPC|XE2|XE3)$")
if(NOT "${uisa}" MATCHES "^(XELP|XEHP|XEHPG|XEHPC|XE2|XE3|XE3P)$")
message(FATAL_ERROR "Unsupported primitive GPU ISA: ${uisa}")
endif()
set(BUILD_${uisa} TRUE)
Expand Down
2 changes: 1 addition & 1 deletion cmake/options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ set(DNNL_ENABLE_PRIMITIVE_GPU_ISA "ALL" CACHE STRING
implementations will always be available. Valid values:
- ALL (the default). Includes all ISA to be enabled.
- <ISA_NAME>;<ISA_NAME>;... Includes only selected ISA to be enabled.
Possible values are: XELP, XEHP, XEHPG, XEHPC, XE2, XE3.")
Possible values are: XELP, XEHP, XEHPG, XEHPC, XE2, XE3, XE3P.")

set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
"Specifies an ISA set of GeMM kernels residing in x64/gemm folder to be
Expand Down
1 change: 1 addition & 0 deletions include/oneapi/dnnl/dnnl_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@
#cmakedefine01 BUILD_XEHPC
#cmakedefine01 BUILD_XE2
#cmakedefine01 BUILD_XE3
#cmakedefine01 BUILD_XE3P
// GeMM kernels ISA controls
#cmakedefine01 BUILD_GEMM_KERNELS_ALL
#cmakedefine01 BUILD_GEMM_KERNELS_NONE
Expand Down
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,12 @@ if(UNIX)
endif()
endif()

# TODO: Remove these after the next pull-down from main.
# When DNNL_WITH_XE3P is set, pass both DNNL_WITH_XE3P and XE3P to the compiler
# as preprocessor definitions through the project helper
# add_definitions_with_host_compiler(). The call precedes the
# add_subdirectory() calls that follow in this file.
# NOTE(review): presumably the helper is directory-scoped like
# add_definitions(), so the definitions reach every subdirectory — confirm.
if(DNNL_WITH_XE3P)
add_definitions_with_host_compiler(-DDNNL_WITH_XE3P)
add_definitions_with_host_compiler(-DXE3P)
endif()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Follow TODO?


add_subdirectory(common)

if(NOT DNNL_CPU_RUNTIME STREQUAL "NONE")
Expand Down
6 changes: 4 additions & 2 deletions src/common/float4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,10 @@ uint8_t float2e3m0(float f) {
min_diff = diff;
raw_bits = idx;
}
// Special case for midpoint, we round to even (so even index)
if ((diff == min_diff) && !(idx & 1)) raw_bits = idx;
// Special case for midpoint:
// - towards 0 for 0.125
// - up for other ties
if ((diff == min_diff) && idx != 1) raw_bits = idx;
}
assert(raw_bits < 8);
// reapply sign
Expand Down
6 changes: 6 additions & 0 deletions src/common/impl_registration.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,4 +239,10 @@
#define REG_XE3_ISA(...)
#endif

// REG_XE3P_ISA(...) forwards its arguments unchanged when either all GPU ISAs
// are enabled (BUILD_PRIMITIVE_GPU_ISA_ALL) or XE3P is enabled explicitly
// (BUILD_XE3P); otherwise it expands to nothing, so the wrapped entries are
// dropped at preprocessing time.
#if BUILD_PRIMITIVE_GPU_ISA_ALL || BUILD_XE3P
#define REG_XE3P_ISA(...) __VA_ARGS__
#else
#define REG_XE3P_ISA(...)
#endif

#endif
4 changes: 4 additions & 0 deletions src/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ file(GLOB SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
)

# Define XE3P=1 for the GPU sources when DNNL_WITH_XE3P is set.
# NOTE(review): src/CMakeLists.txt adds a bare -DXE3P under the same
# condition; a bare -D<name> normally defines the macro to 1 as well, so this
# block may be redundant — confirm before keeping both.
if(DNNL_WITH_XE3P)
add_definitions_with_host_compiler(-DXE3P=1)
endif()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It feels like this is not needed.


set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
Expand Down
34 changes: 33 additions & 1 deletion src/gpu/intel/compute/device_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ uint64_t get_future_extensions(
case gpu_arch_t::xe2:
case gpu_arch_t::xe_hpc:
case gpu_arch_t::xe3:
case gpu_arch_t::xe3p_35_10:
case gpu_arch_t::xe3p_35_11:
case gpu_arch_t::xe3p_35_unknown:
extensions |= (uint64_t)device_ext_t::intel_global_float_atomics;
extensions
|= (uint64_t)device_ext_t::intel_variable_eu_thread_count;
Expand Down Expand Up @@ -109,7 +112,13 @@ bool device_info_t::mayiuse_sub_group(int size) const {
case gpu_arch_t::xe_lp:
case gpu_arch_t::xe_hp:
case gpu_arch_t::xe_hpg: return utils::one_of(size, 8, 16, 32);
default: return utils::one_of(size, 16, 32);
case gpu_arch_t::xe_hpc:
case gpu_arch_t::xe2:
case gpu_arch_t::xe3:
case gpu_arch_t::xe3p_35_10:
case gpu_arch_t::xe3p_35_11:
case gpu_arch_t::xe3p_35_unknown: return utils::one_of(size, 16, 32);
default: return utils::one_of(size, 32);
}
}

Expand Down Expand Up @@ -145,6 +154,9 @@ int device_info_t::max_eus_per_wg(gpu_arch_t gpu_arch) {
switch (gpu_arch) {
case gpu::intel::compute::gpu_arch_t::xe_hpc:
case gpu::intel::compute::gpu_arch_t::xe2:
case gpu_arch_t::xe3p_35_10:
case gpu_arch_t::xe3p_35_11:
case gpu_arch_t::xe3p_35_unknown:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: same style for prefixes here and one below?

case gpu::intel::compute::gpu_arch_t::xe3: return 8;
case gpu::intel::compute::gpu_arch_t::xe_lp:
case gpu::intel::compute::gpu_arch_t::xe_hp:
Expand All @@ -158,6 +170,9 @@ int device_info_t::max_subgroup_size(gpu_arch_t gpu_arch) {
switch (gpu_arch) {
case gpu::intel::compute::gpu_arch_t::xe_hpc:
case gpu::intel::compute::gpu_arch_t::xe2:
case gpu_arch_t::xe3p_35_10:
case gpu_arch_t::xe3p_35_11:
case gpu_arch_t::xe3p_35_unknown:
case gpu::intel::compute::gpu_arch_t::xe3: return 32;
case gpu::intel::compute::gpu_arch_t::xe_lp:
case gpu::intel::compute::gpu_arch_t::xe_hp:
Expand All @@ -179,6 +194,9 @@ int device_info_t::min_subgroup_size() const {
case gpu_arch_t::xe_hpg: return 8;
case gpu_arch_t::xe_hpc:
case gpu_arch_t::xe2:
case gpu_arch_t::xe3p_35_10:
case gpu_arch_t::xe3p_35_11:
case gpu_arch_t::xe3p_35_unknown:
case gpu_arch_t::xe3: return 16;
default: return 0;
}
Expand All @@ -188,6 +206,9 @@ int device_info_t::max_exec_size(gpu_arch_t gpu_arch) {
switch (gpu_arch) {
case gpu::intel::compute::gpu_arch_t::xe_hpc:
case gpu::intel::compute::gpu_arch_t::xe2:
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
case gpu::intel::compute::gpu_arch_t::xe3: return 128;
default: return 64;
}
Expand Down Expand Up @@ -221,6 +242,9 @@ int device_info_t::threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode) {
case gpu::intel::compute::gpu_arch_t::xe_hpg:
case gpu::intel::compute::gpu_arch_t::xe_hpc:
case gpu::intel::compute::gpu_arch_t::xe2:
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
case gpu::intel::compute::gpu_arch_t::xe3:
return large_grf_mode ? 4 : 8;
case gpu::intel::compute::gpu_arch_t::unknown: return 7;
Expand All @@ -238,6 +262,11 @@ int device_info_t::max_slm_size(gpu_arch_t gpu_arch) {
case gpu::intel::compute::gpu_arch_t::xe_hpg:
case gpu::intel::compute::gpu_arch_t::xe_hpc:
case gpu::intel::compute::gpu_arch_t::xe2:
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
slm_size = 3 * (1 << 17);
break;
case gpu::intel::compute::gpu_arch_t::xe3: slm_size = (1 << 17); break;
case gpu::intel::compute::gpu_arch_t::unknown: assert(!"not expected");
}
Expand Down Expand Up @@ -269,6 +298,9 @@ size_t device_info_t::icache_size() const {
case gpu::intel::compute::gpu_arch_t::xe_hpc: return 80 * 1024;
case gpu::intel::compute::gpu_arch_t::xe2: return 96 * 1024;
case gpu::intel::compute::gpu_arch_t::xe3: return 96 * 1024;
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown: return 80 * 1024;
case gpu::intel::compute::gpu_arch_t::unknown: assert(!"not expected");
}
return 0;
Expand Down
22 changes: 21 additions & 1 deletion src/gpu/intel/compute/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,18 @@ namespace gpu {
namespace intel {
namespace compute {

enum class gpu_arch_t { unknown, xe_lp, xe_hp, xe_hpg, xe_hpc, xe2, xe3 };
// GPU architecture identifiers.
//
// The declaration order is significant: callers compare architectures
// relationally (for example `arch_ >= arch_t::xe3p_35_10` and
// `arch_ >= arch_t::xe_hp` in the GEMM JIT dispatch code; presumably `arch_t`
// aliases this enum — confirm), so new enumerators must be added in increasing
// generation order.
//
// When adding an enumerator, also add a matching CASE entry to to_string() and
// str2gpu_arch() so the value round-trips through its string name.
enum class gpu_arch_t {
unknown,
xe_lp,
xe_hp,
xe_hpg,
xe_hpc,
xe2,
xe3,
xe3p_35_10,
xe3p_35_11,
xe3p_35_unknown,
};

// Memory for storing ngen::Product to avoid directly including nGEN because of
// header dependencies outside of src/gpu/intel.
Expand All @@ -58,6 +69,9 @@ static inline const char *to_string(gpu_arch_t arch) {
CASE(xe_hpc);
CASE(xe2);
CASE(xe3);
CASE(xe3p_35_10);
CASE(xe3p_35_11);
CASE(xe3p_35_unknown);
return "unknown";
#undef CASE
}
Expand All @@ -71,6 +85,9 @@ static inline gpu_arch_t str2gpu_arch(const char *str) {
CASE(xe_hpc);
CASE(xe2);
CASE(xe3);
CASE(xe3p_35_10);
CASE(xe3p_35_11);
CASE(xe3p_35_unknown);
return gpu_arch_t::unknown;
#undef CASE
}
Expand Down Expand Up @@ -253,6 +270,8 @@ struct device_info_t {

bool has_native(data_type_t type) const;

bool is_efficient_64bit() const { return is_efficient_64bit_; }

const std::vector<uint8_t> &get_cache_blob() const {
return serialized_device_info_.get_data();
}
Expand Down Expand Up @@ -282,6 +301,7 @@ struct device_info_t {
bool mayiuse_systolic_ = false;
bool mayiuse_ngen_kernels_ = false;
bool mayiuse_system_memory_allocators_ = false;
bool is_efficient_64bit_ = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be nice to have a comment explaining what is efficient 64 bit...


std::string name_;
xpu::runtime_version_t runtime_version_;
Expand Down
2 changes: 2 additions & 0 deletions src/gpu/intel/conv/jit/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1094,6 +1094,8 @@ status_t init_vec_size(config_t &cfg) {

// Returns the default GRF register count for the given configuration:
// - 128 when the hardware has no large GRF support or the FMA kind is not
//   dpas/dpasw;
// - 512 for dpas/dpasw on XE3P_35_11;
// - 256 for dpas/dpasw on any other hardware with large GRF support.
int default_regs(const config_t &cfg) {
    const int base_regs = 128;
    if (!cfg.hw().large_grf_support()) return base_regs;

    const bool is_xe3p_35_11 = (cfg.hw() == ngen::HW::XE3P_35_11);
    const bool uses_dpas = cfg.is_dpas_or_dpasw_fma();
    if (!uses_dpas) return base_regs;

    return is_xe3p_35_11 ? 512 : 256;
}
Expand Down
3 changes: 3 additions & 0 deletions src/gpu/intel/conv/jit/ir_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,9 @@ class compute_builder_t {
alloc_updater.update(buf_mgr_);
}

// Assign {Fwd} for dpas when applicable.
if (cfg_.hw() >= ngen::HW::XE3P_35_10)
x2r_mul_stmt_ = inject_dpas_fwd(x2r_mul_stmt_);
// Assign {Atomic} for dpas(w) when applicable.
x2r_mul_stmt_ = inject_dpas_atomic(x2r_mul_stmt_);
}
Expand Down
3 changes: 3 additions & 0 deletions src/gpu/intel/conv/jit/model_bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ hw_t to_hw(ngen::HW hw) {
case ngen::HW::XeHPC: return hw_t::xehpc;
case ngen::HW::Xe2: return hw_t::xehpc;
case ngen::HW::Xe3: return hw_t::xehpc;
case ngen::HW::XE3P_35_10:
case ngen::HW::XE3P_35_11:
case ngen::HW::XE3P_UNKNOWN: return hw_t::xehpc;
default: gpu_error_not_expected() << "Unknown HW: " << to_string(hw);
}
return hw_t::undef;
Expand Down
43 changes: 42 additions & 1 deletion src/gpu/intel/conv/jit/plan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1355,7 +1355,8 @@ struct fma_context_t {
bool is_dpas = is_dp_fma(fma);
bool is_a = (abc == abc_kind_t::a);
auto type = (is_a ? a_type : b_type);
bool cvt_f16 = (layout.type().is_fp8() || layout.type().is_fp4());
bool cvt_f16 = ((hw < ngen::HW::XE3P_35_10 && layout.type().is_fp8())
|| (hw < ngen::HW::XE3P_35_11 && layout.type().is_fp4()));
int type_size = (cvt_f16 ? 2 : type.size());
if (is_dpas) {
int sdepth = 8;
Expand Down Expand Up @@ -2209,11 +2210,51 @@ class plan_builder_t {
return plan_status_t::success;
}

// Extends the view to cover 256 contiguous bytes for more efficient
// prefetching.
//
// thr_view is the per-thread prefetch view; it is modified in place only when
// all of the conditions listed below hold, otherwise it is left untouched.
// The decision is based on the two innermost blocks of the pseudo vlayout
// created from thr_view.
//
// NOTE(review): the full dimension size is always taken from
// gemm_schedule_.a_view(), while the visible caller (init_x_prefetch_plan)
// invokes this helper without passing its abc argument — confirm this is
// intended when prefetching B.
void maybe_extend_prefetch_thread_view_to_256_bytes(
view_t &thr_view) const {
auto thr_layout = thr_view.create_pseudo_vlayout();
auto &blocks = thr_layout.blocks();
// At least two blocks are needed: b0 (innermost) and b1 (next outer).
if (blocks.size() <= 1) return;

auto &b0 = blocks[0];
auto &b1 = blocks[1];
// Both strides are compared against concrete values below, so they must be
// fixed.
if (!b1.stride.is_fixed() || !b0.stride.is_fixed()) return;
auto inner_var = thr_view.vvars()[b0.idx];
// b0 is unit-stride and b1 starts past the end of b0, i.e. there is a gap
// after the inner block that the extension can grow into.
bool is_block_strided
= (b0.stride == stride_t(1)) && (b1.stride > b0.size);
int type_size = thr_layout.type().size();
// Size in bytes of the whole tensor dimension that b0 belongs to.
dim_t full_dim_size
= gemm_schedule_.a_view().vdims()[b0.idx] * type_size;
bool size_ge_256b = (full_dim_size >= 256);
// Size in bytes currently covered by the inner block.
dim_t b0_size = b0.size * type_size;
bool prefetch_lt_256b = (b0_size < 256);
bool is_inner_loop = gemm_schedule_.is_inner_loop(inner_var);
// Extend if the following conditions are satisfied:
// - The inner block (b0) is dense and smaller than 256 bytes
// - The original tensor has at least 256 bytes across b0 dimension
// - The inner block dimensions corresponds to the inner loop
// dimension. We want to prefetch extra cache lines only if they are
// going to be used by the next iterations.
if (is_block_strided && size_ge_256b && prefetch_lt_256b
&& is_inner_loop) {
gpu_assert(thr_view.vdims()[b0.idx] == b0.size);
// Integer division (assuming dim_t is an integral type): when b0_size does
// not divide 256 the extended block covers fewer than 256 bytes.
int factor = 256 / b0_size;
// Grow the inner dimension by `factor`, keeping the original start offset.
thr_view.set_vdim(inner_var, b0.size * factor,
thr_view.vstart()[b0.idx],
/*overwrite=*/true);
}
}

plan_status_t init_x_prefetch_plan(abc_kind_t abc, const view_t &tg_view,
grid_info_t &grid, send_plan_t &prefetch) const {
if (!use_prefetch(abc)) return plan_status_t::success;
auto &tg = cfg_.thread_group_grid();
auto thr_view = tg_view.split(tg, &grid);
if (cfg_.hw() == ngen::HW::XE3P_35_11) {
maybe_extend_prefetch_thread_view_to_256_bytes(thr_view);
}
auto params = get_send_params(cfg_.options(), send_op_t::prefetch,
send_address_t::a64, fma_kind_t::undef, abc, thr_view,
gemm_schedule_);
Expand Down
6 changes: 5 additions & 1 deletion src/gpu/intel/gemm/jit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ struct gen_t : public primitive_t {
// Check GPU architecture.
bool arch_ok = utils::one_of(arch_, arch_t::xe_lp, arch_t::xe_hp,
arch_t::xe_hpg, arch_t::xe_hpc, arch_t::xe2, arch_t::xe3);
arch_ok |= (arch_ >= arch_t::xe3p_35_10);

VDISPATCH_GEMM(arch_ok, VERBOSE_UNSUPPORTED_ARCH, "gpu");
VDISPATCH_GEMM(IMPLICATION(with_binary, arch_ >= arch_t::xe_hp),
Expand All @@ -215,7 +216,7 @@ struct gen_t : public primitive_t {
|| intel_engine->mayiuse(compute::device_ext_t::
intel_subgroup_split_matrix_multiply_accumulate);

bool is_integrated = intel_engine->device_info()->is_integrated();
bool is_integrated = dev_info_->is_integrated();

// Size checks for fused reduction kernels.
if (with_sum_ab()) {
Expand Down Expand Up @@ -260,6 +261,9 @@ struct gen_t : public primitive_t {
!with_eltwise && !with_binary),
VERBOSE_UNSUPPORTED_POSTOP);

if (arch_ >= arch_t::xe3p_35_10)
kernel_desc_.set_efficient_64b(dev_info_->is_efficient_64bit());

bool print_verbose = get_verbose(verbose_t::debuginfo) >= 5;
bool kernel_success = false;
auto lda = ld(DNNL_ARG_A);
Expand Down
7 changes: 4 additions & 3 deletions src/gpu/intel/gemm/jit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,16 @@ endif()

# Use oneDNN names for ALL to ensure string replacement functions correctly
set(GPUS ${DNNL_ENABLE_PRIMITIVE_GPU_ISA})
string(REPLACE "ALL" "XELP;XEHP;XEHPG;XEHPC;XE2;XE3" GPUS "${GPUS}")
string(REPLACE "ALL" "XELP;XEHP;XEHPG;XEHPC;XE2;XE3;XE3P" GPUS "${GPUS}")
# Translate oneDNN ISA names in GPUS to the architecture names listed in
# ALL_GPUS. string(REPLACE) substitutes every occurrence of the match string,
# including occurrences inside longer names, so a name that contains another
# name as a prefix must be handled first: XEHPG/XEHPC before XEHP, and XE3P
# before XE3. Replacing XE3 first would already rewrite "XE3P" to "Xe3P",
# which leaves the XE3P rule dead and makes the result depend on the two
# target spellings happening to agree.
string(REPLACE "XELP" "12LP" GPUS "${GPUS}")
string(REPLACE "XEHPG" "12p7" GPUS "${GPUS}")
string(REPLACE "XEHPC" "12p8" GPUS "${GPUS}")
string(REPLACE "XEHP" "12HP" GPUS "${GPUS}")
string(REPLACE "XE2" "Xe2" GPUS "${GPUS}")
string(REPLACE "XE3P" "Xe3P" GPUS "${GPUS}")
string(REPLACE "XE3" "Xe3" GPUS "${GPUS}")

set(ALL_GPUS "12LP;12HP;12p7;12p8;Xe2;Xe3")
set(ALL_GPUS "12LP;12HP;12p7;12p8;Xe2;Xe3;Xe3P")
foreach(GPU ${GPUS})
if(NOT ${GPU} IN_LIST ALL_GPUS)
message(FATAL_ERROR "Unknown GPU architecture: ${GPU}")
Expand Down Expand Up @@ -63,7 +64,7 @@ if(DPCPP_HOST_COMPILER_KIND STREQUAL "DEFAULT")
)

if (DNNL_ENABLE_PRIMITIVE_GPU_ISA STREQUAL "ALL")
set(DNNL_GPU_ISA_LIST "XELP;XEHP;XEHPG;XEHPC;XE2;XE3")
set(DNNL_GPU_ISA_LIST "XELP;XEHP;XEHPG;XEHPC;XE2;XE3;XE3P")
else()
foreach(isa ${DNNL_ENABLE_PRIMITIVE_GPU_ISA})
string(TOUPPER ${isa} ISA)
Expand Down
Loading
Loading