Skip to content

Commit b6e2770

Browse files
committed
gpu: backport xe3p
1 parent aa90c2d commit b6e2770

File tree

119 files changed

+4249
-435
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

119 files changed

+4249
-435
lines changed

cmake/configuring_primitive_list.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ if (DNNL_ENABLE_PRIMITIVE_GPU_ISA STREQUAL "ALL")
5858
else()
5959
foreach(isa ${DNNL_ENABLE_PRIMITIVE_GPU_ISA})
6060
string(TOUPPER ${isa} uisa)
61-
if(NOT "${uisa}" MATCHES "^(XELP|XEHP|XEHPG|XEHPC|XE2|XE3)$")
61+
if(NOT "${uisa}" MATCHES "^(XELP|XEHP|XEHPG|XEHPC|XE2|XE3|XE3P)$")
6262
message(FATAL_ERROR "Unsupported primitive GPU ISA: ${uisa}")
6363
endif()
6464
set(BUILD_${uisa} TRUE)

cmake/options.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ set(DNNL_ENABLE_PRIMITIVE_GPU_ISA "ALL" CACHE STRING
151151
implementations will always be available. Valid values:
152152
- ALL (the default). Includes all ISA to be enabled.
153153
- <ISA_NAME>;<ISA_NAME>;... Includes only selected ISA to be enabled.
154-
Possible values are: XELP, XEHP, XEHPG, XEHPC, XE2, XE3.")
154+
Possible values are: XELP, XEHP, XEHPG, XEHPC, XE2, XE3, XE3P.")
155155

156156
set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
157157
"Specifies an ISA set of GeMM kernels residing in x64/gemm folder to be

include/oneapi/dnnl/dnnl_config.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@
227227
#cmakedefine01 BUILD_XEHPC
228228
#cmakedefine01 BUILD_XE2
229229
#cmakedefine01 BUILD_XE3
230+
#cmakedefine01 BUILD_XE3P
230231
// GeMM kernels ISA controls
231232
#cmakedefine01 BUILD_GEMM_KERNELS_ALL
232233
#cmakedefine01 BUILD_GEMM_KERNELS_NONE

src/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,12 @@ if(UNIX)
125125
endif()
126126
endif()
127127

128+
# TODO: Remove these after the next pull-down from main.
129+
# When DNNL_WITH_XE3P is set, hand both -DDNNL_WITH_XE3P and -DXE3P to
# add_definitions_with_host_compiler (defined outside this hunk).
if(DNNL_WITH_XE3P)
130+
add_definitions_with_host_compiler(-DDNNL_WITH_XE3P)
131+
add_definitions_with_host_compiler(-DXE3P)
132+
endif()
133+
128134
add_subdirectory(common)
129135

130136
if(NOT DNNL_CPU_RUNTIME STREQUAL "NONE")

src/common/float4.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,10 @@ uint8_t float2e3m0(float f) {
113113
min_diff = diff;
114114
raw_bits = idx;
115115
}
116-
// Special case for midpoint, we round to even (so even index)
117-
if ((diff == min_diff) && !(idx & 1)) raw_bits = idx;
116+
// Special case for midpoint:
117+
// - towards 0 for 0.125
118+
// - up for other ties
119+
if ((diff == min_diff) && idx != 1) raw_bits = idx;
118120
}
119121
assert(raw_bits < 8);
120122
// reapply sign

src/common/impl_registration.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,4 +239,10 @@
239239
#define REG_XE3_ISA(...)
240240
#endif
241241

242+
// REG_XE3P_ISA(...) keeps its arguments when BUILD_PRIMITIVE_GPU_ISA_ALL or
// BUILD_XE3P is non-zero, and expands to nothing otherwise.
#if BUILD_PRIMITIVE_GPU_ISA_ALL || BUILD_XE3P
243+
#define REG_XE3P_ISA(...) __VA_ARGS__
244+
#else
245+
#define REG_XE3P_ISA(...)
246+
#endif
247+
242248
#endif

src/gpu/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ file(GLOB SOURCES
1919
${CMAKE_CURRENT_SOURCE_DIR}/*.cpp
2020
)
2121

22+
# When DNNL_WITH_XE3P is set, pass -DXE3P=1 to add_definitions_with_host_compiler.
# NOTE(review): src/CMakeLists.txt passes -DXE3P under the same condition;
# presumably one of the two is redundant - confirm whether both are needed.
if(DNNL_WITH_XE3P)
23+
add_definitions_with_host_compiler(-DXE3P=1)
24+
endif()
25+
2226
set(OBJ_LIB ${LIB_PACKAGE_NAME}_gpu)
2327
add_library(${OBJ_LIB} OBJECT ${SOURCES})
2428
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS

src/gpu/intel/compute/device_info.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ uint64_t get_future_extensions(
4545
case gpu_arch_t::xe2:
4646
case gpu_arch_t::xe_hpc:
4747
case gpu_arch_t::xe3:
48+
case gpu_arch_t::xe3p_35_10:
49+
case gpu_arch_t::xe3p_35_11:
50+
case gpu_arch_t::xe3p_35_unknown:
4851
extensions |= (uint64_t)device_ext_t::intel_global_float_atomics;
4952
extensions
5053
|= (uint64_t)device_ext_t::intel_variable_eu_thread_count;
@@ -109,7 +112,13 @@ bool device_info_t::mayiuse_sub_group(int size) const {
109112
case gpu_arch_t::xe_lp:
110113
case gpu_arch_t::xe_hp:
111114
case gpu_arch_t::xe_hpg: return utils::one_of(size, 8, 16, 32);
112-
default: return utils::one_of(size, 16, 32);
115+
case gpu_arch_t::xe_hpc:
116+
case gpu_arch_t::xe2:
117+
case gpu_arch_t::xe3:
118+
case gpu_arch_t::xe3p_35_10:
119+
case gpu_arch_t::xe3p_35_11:
120+
case gpu_arch_t::xe3p_35_unknown: return utils::one_of(size, 16, 32);
121+
default: return utils::one_of(size, 32);
113122
}
114123
}
115124

@@ -145,6 +154,9 @@ int device_info_t::max_eus_per_wg(gpu_arch_t gpu_arch) {
145154
switch (gpu_arch) {
146155
case gpu::intel::compute::gpu_arch_t::xe_hpc:
147156
case gpu::intel::compute::gpu_arch_t::xe2:
157+
case gpu_arch_t::xe3p_35_10:
158+
case gpu_arch_t::xe3p_35_11:
159+
case gpu_arch_t::xe3p_35_unknown:
148160
case gpu::intel::compute::gpu_arch_t::xe3: return 8;
149161
case gpu::intel::compute::gpu_arch_t::xe_lp:
150162
case gpu::intel::compute::gpu_arch_t::xe_hp:
@@ -158,6 +170,9 @@ int device_info_t::max_subgroup_size(gpu_arch_t gpu_arch) {
158170
switch (gpu_arch) {
159171
case gpu::intel::compute::gpu_arch_t::xe_hpc:
160172
case gpu::intel::compute::gpu_arch_t::xe2:
173+
case gpu_arch_t::xe3p_35_10:
174+
case gpu_arch_t::xe3p_35_11:
175+
case gpu_arch_t::xe3p_35_unknown:
161176
case gpu::intel::compute::gpu_arch_t::xe3: return 32;
162177
case gpu::intel::compute::gpu_arch_t::xe_lp:
163178
case gpu::intel::compute::gpu_arch_t::xe_hp:
@@ -179,6 +194,9 @@ int device_info_t::min_subgroup_size() const {
179194
case gpu_arch_t::xe_hpg: return 8;
180195
case gpu_arch_t::xe_hpc:
181196
case gpu_arch_t::xe2:
197+
case gpu_arch_t::xe3p_35_10:
198+
case gpu_arch_t::xe3p_35_11:
199+
case gpu_arch_t::xe3p_35_unknown:
182200
case gpu_arch_t::xe3: return 16;
183201
default: return 0;
184202
}
@@ -188,6 +206,9 @@ int device_info_t::max_exec_size(gpu_arch_t gpu_arch) {
188206
switch (gpu_arch) {
189207
case gpu::intel::compute::gpu_arch_t::xe_hpc:
190208
case gpu::intel::compute::gpu_arch_t::xe2:
209+
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
210+
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
211+
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
191212
case gpu::intel::compute::gpu_arch_t::xe3: return 128;
192213
default: return 64;
193214
}
@@ -221,6 +242,9 @@ int device_info_t::threads_per_eu(gpu_arch_t gpu_arch, bool large_grf_mode) {
221242
case gpu::intel::compute::gpu_arch_t::xe_hpg:
222243
case gpu::intel::compute::gpu_arch_t::xe_hpc:
223244
case gpu::intel::compute::gpu_arch_t::xe2:
245+
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
246+
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
247+
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
224248
case gpu::intel::compute::gpu_arch_t::xe3:
225249
return large_grf_mode ? 4 : 8;
226250
case gpu::intel::compute::gpu_arch_t::unknown: return 7;
@@ -238,6 +262,11 @@ int device_info_t::max_slm_size(gpu_arch_t gpu_arch) {
238262
// NOTE(review): xe_hpg, xe_hpc and xe2 have no statement of their own, so
// they share the first assignment that follows. With the xe3p labels placed
// here that is 3 * (1 << 17); when the next label was xe3 it was (1 << 17).
// Confirm the larger value is intended for those architectures; if not, the
// xe3p group should come after the xe3 case.
case gpu::intel::compute::gpu_arch_t::xe_hpg:
239263
case gpu::intel::compute::gpu_arch_t::xe_hpc:
240264
case gpu::intel::compute::gpu_arch_t::xe2:
265+
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
266+
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
267+
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown:
268+
slm_size = 3 * (1 << 17);
269+
break;
241270
case gpu::intel::compute::gpu_arch_t::xe3: slm_size = (1 << 17); break;
242271
case gpu::intel::compute::gpu_arch_t::unknown: assert(!"not expected");
243272
}
@@ -269,6 +298,9 @@ size_t device_info_t::icache_size() const {
269298
case gpu::intel::compute::gpu_arch_t::xe_hpc: return 80 * 1024;
270299
case gpu::intel::compute::gpu_arch_t::xe2: return 96 * 1024;
271300
case gpu::intel::compute::gpu_arch_t::xe3: return 96 * 1024;
301+
case gpu::intel::compute::gpu_arch_t::xe3p_35_10:
302+
case gpu::intel::compute::gpu_arch_t::xe3p_35_11:
303+
case gpu::intel::compute::gpu_arch_t::xe3p_35_unknown: return 80 * 1024;
272304
case gpu::intel::compute::gpu_arch_t::unknown: assert(!"not expected");
273305
}
274306
return 0;

src/gpu/intel/compute/device_info.hpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,18 @@ namespace gpu {
4141
namespace intel {
4242
namespace compute {
4343

44-
enum class gpu_arch_t { unknown, xe_lp, xe_hp, xe_hpg, xe_hpc, xe2, xe3 };
44+
// Identifiers for the GPU architectures distinguished by this module.
// to_string() and str2gpu_arch() below enumerate these through CASE(...)
// entries; presumably an enumerator without an entry there falls back to
// "unknown" / gpu_arch_t::unknown, so add one in both places when extending
// this list.
enum class gpu_arch_t {
45+
unknown,
46+
xe_lp,
47+
xe_hp,
48+
xe_hpg,
49+
xe_hpc,
50+
xe2,
51+
xe3,
52+
xe3p_35_10,
53+
xe3p_35_11,
54+
xe3p_35_unknown,
55+
};
4556

4657
// Memory for storing ngen::Product to avoid directly including nGEN because of
4758
// header dependencies outside of src/gpu/intel.
@@ -58,6 +69,9 @@ static inline const char *to_string(gpu_arch_t arch) {
5869
CASE(xe_hpc);
5970
CASE(xe2);
6071
CASE(xe3);
72+
CASE(xe3p_35_10);
73+
CASE(xe3p_35_11);
74+
CASE(xe3p_35_unknown);
6175
return "unknown";
6276
#undef CASE
6377
}
@@ -71,6 +85,9 @@ static inline gpu_arch_t str2gpu_arch(const char *str) {
7185
CASE(xe_hpc);
7286
CASE(xe2);
7387
CASE(xe3);
88+
CASE(xe3p_35_10);
89+
CASE(xe3p_35_11);
90+
CASE(xe3p_35_unknown);
7491
return gpu_arch_t::unknown;
7592
#undef CASE
7693
}
@@ -253,6 +270,8 @@ struct device_info_t {
253270

254271
bool has_native(data_type_t type) const;
255272

273+
// Returns the is_efficient_64bit_ member, which is initialized to false in
// its declaration. NOTE(review): no code that sets the flag is visible here.
bool is_efficient_64bit() const { return is_efficient_64bit_; }
274+
256275
const std::vector<uint8_t> &get_cache_blob() const {
257276
return serialized_device_info_.get_data();
258277
}
@@ -282,6 +301,7 @@ struct device_info_t {
282301
bool mayiuse_systolic_ = false;
283302
bool mayiuse_ngen_kernels_ = false;
284303
bool mayiuse_system_memory_allocators_ = false;
304+
bool is_efficient_64bit_ = false;
285305

286306
std::string name_;
287307
xpu::runtime_version_t runtime_version_;

src/gpu/intel/conv/jit/config.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,6 +1094,8 @@ status_t init_vec_size(config_t &cfg) {
10941094

10951095
// Picks the default register count for the given config:
//   - 128 when the hardware reports no large-GRF support;
//   - 512 on ngen::HW::XE3P_35_11 when the config uses dpas/dpasw FMA;
//   - 256 for any other dpas/dpasw FMA config;
//   - 128 otherwise.
// The XE3P_35_11 check must stay ahead of the generic dpas/dpasw check,
// which would otherwise return 256 for it.
int default_regs(const config_t &cfg) {
10961096
if (!cfg.hw().large_grf_support()) return 128;
1097+
if (cfg.hw() == ngen::HW::XE3P_35_11 && cfg.is_dpas_or_dpasw_fma())
1098+
return 512;
10971099
if (cfg.is_dpas_or_dpasw_fma()) return 256;
10981100
return 128;
10991101
}

0 commit comments

Comments
 (0)