From 12397152f4f861966d12caeeefc4dde97cdb7f35 Mon Sep 17 00:00:00 2001 From: Victor Mustya Date: Wed, 3 Dec 2025 10:40:27 -0800 Subject: [PATCH 1/2] Add support for the cl_intel_subgroup_buffer_prefetch The commit adds support for the cl_intel_subgroup_buffer_prefetch OpenCL extension. The extension introduces new built-in functions that allow prefetching data from global memory to caches as a subgroup-level operation. The extension is defined here: https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_buffer_prefetch.html --- clang/lib/Headers/opencl-c.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h index f65b4b314cffd..2ea48f2760c76 100644 --- a/clang/lib/Headers/opencl-c.h +++ b/clang/lib/Headers/opencl-c.h @@ -17525,6 +17525,13 @@ void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint #endif // defined(cl_intel_subgroups_char) || defined(cl_intel_subgroups_short) || // defined(cl_intel_subgroups_long) + +#if defined(cl_intel_subgroup_buffer_prefetch) +void __ovld __conv intel_sub_group_block_prefetch_ui(const __global uint *p); +void __ovld __conv intel_sub_group_block_prefetch_ui2(const __global uint *p); +void __ovld __conv intel_sub_group_block_prefetch_ui4(const __global uint *p); +void __ovld __conv intel_sub_group_block_prefetch_ui8(const __global uint *p); +#endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups #if defined(cl_intel_subgroups_short) @@ -17660,6 +17667,14 @@ void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, u void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data ); void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data ); void __ovld __conv intel_sub_group_block_write_us16( __global ushort* p, ushort16 data ); + +#if defined(cl_intel_subgroup_buffer_prefetch) +void __ovld __conv 
intel_sub_group_block_prefetch_us(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us2(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us4(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us8(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us16(const __global ushort *p); +#endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups_short #if defined(cl_intel_subgroups_char) @@ -17795,6 +17810,14 @@ void __ovld __conv intel_sub_group_block_write_uc2( __global uchar* p, uc void __ovld __conv intel_sub_group_block_write_uc4( __global uchar* p, uchar4 data ); void __ovld __conv intel_sub_group_block_write_uc8( __global uchar* p, uchar8 data ); void __ovld __conv intel_sub_group_block_write_uc16( __global uchar* p, uchar16 data ); + +#if defined(cl_intel_subgroup_buffer_prefetch) +void __ovld __conv intel_sub_group_block_prefetch_uc(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc2(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc4(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc8(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc16(const __global uchar *p); +#endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups_char #if defined(cl_intel_subgroups_long) @@ -17839,6 +17862,13 @@ void __ovld __conv intel_sub_group_block_write_ul( __global ulong* p, ul void __ovld __conv intel_sub_group_block_write_ul2( __global ulong* p, ulong2 data ); void __ovld __conv intel_sub_group_block_write_ul4( __global ulong* p, ulong4 data ); void __ovld __conv intel_sub_group_block_write_ul8( __global ulong* p, ulong8 data); + +#if defined(cl_intel_subgroup_buffer_prefetch) +void __ovld __conv intel_sub_group_block_prefetch_ul(const __global ulong *p); +void __ovld __conv 
intel_sub_group_block_prefetch_ul2(const __global ulong *p); +void __ovld __conv intel_sub_group_block_prefetch_ul4(const __global ulong *p); +void __ovld __conv intel_sub_group_block_prefetch_ul8(const __global ulong *p); +#endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups_long #if defined(cl_intel_subgroup_local_block_io) From 98a306453f1573ab55e53cc270d638e04fa731bd Mon Sep 17 00:00:00 2001 From: Victor Mustya Date: Thu, 4 Dec 2025 15:12:06 -0800 Subject: [PATCH 2/2] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mészáros Gergely --- clang/lib/Headers/opencl-c.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h index 2ea48f2760c76..f9cdb2e706f01 100644 --- a/clang/lib/Headers/opencl-c.h +++ b/clang/lib/Headers/opencl-c.h @@ -17527,10 +17527,10 @@ void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint // defined(cl_intel_subgroups_long) #if defined(cl_intel_subgroup_buffer_prefetch) -void __ovld __conv intel_sub_group_block_prefetch_ui(const __global uint *p); -void __ovld __conv intel_sub_group_block_prefetch_ui2(const __global uint *p); -void __ovld __conv intel_sub_group_block_prefetch_ui4(const __global uint *p); -void __ovld __conv intel_sub_group_block_prefetch_ui8(const __global uint *p); +void __ovld __conv intel_sub_group_block_prefetch_ui(const __global uint *p); +void __ovld __conv intel_sub_group_block_prefetch_ui2(const __global uint *p); +void __ovld __conv intel_sub_group_block_prefetch_ui4(const __global uint *p); +void __ovld __conv intel_sub_group_block_prefetch_ui8(const __global uint *p); #endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups @@ -17669,11 +17669,11 @@ void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, u void __ovld __conv 
intel_sub_group_block_write_us16( __global ushort* p, ushort16 data ); #if defined(cl_intel_subgroup_buffer_prefetch) -void __ovld __conv intel_sub_group_block_prefetch_us(const __global ushort *p); -void __ovld __conv intel_sub_group_block_prefetch_us2(const __global ushort *p); -void __ovld __conv intel_sub_group_block_prefetch_us4(const __global ushort *p); -void __ovld __conv intel_sub_group_block_prefetch_us8(const __global ushort *p); -void __ovld __conv intel_sub_group_block_prefetch_us16(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us2(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us4(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us8(const __global ushort *p); +void __ovld __conv intel_sub_group_block_prefetch_us16(const __global ushort *p); #endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups_short @@ -17812,11 +17812,11 @@ void __ovld __conv intel_sub_group_block_write_uc8( __global uchar* p, uc void __ovld __conv intel_sub_group_block_write_uc16( __global uchar* p, uchar16 data ); #if defined(cl_intel_subgroup_buffer_prefetch) -void __ovld __conv intel_sub_group_block_prefetch_uc(const __global uchar *p); -void __ovld __conv intel_sub_group_block_prefetch_uc2(const __global uchar *p); -void __ovld __conv intel_sub_group_block_prefetch_uc4(const __global uchar *p); -void __ovld __conv intel_sub_group_block_prefetch_uc8(const __global uchar *p); -void __ovld __conv intel_sub_group_block_prefetch_uc16(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc2(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc4(const __global uchar *p); +void __ovld __conv intel_sub_group_block_prefetch_uc8(const __global uchar *p); +void 
__ovld __conv intel_sub_group_block_prefetch_uc16(const __global uchar *p); #endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups_char @@ -17864,10 +17864,10 @@ void __ovld __conv intel_sub_group_block_write_ul4( __global ulong* p, ul void __ovld __conv intel_sub_group_block_write_ul8( __global ulong* p, ulong8 data); #if defined(cl_intel_subgroup_buffer_prefetch) -void __ovld __conv intel_sub_group_block_prefetch_ul(const __global ulong *p); -void __ovld __conv intel_sub_group_block_prefetch_ul2(const __global ulong *p); -void __ovld __conv intel_sub_group_block_prefetch_ul4(const __global ulong *p); -void __ovld __conv intel_sub_group_block_prefetch_ul8(const __global ulong *p); +void __ovld __conv intel_sub_group_block_prefetch_ul(const __global ulong *p); +void __ovld __conv intel_sub_group_block_prefetch_ul2(const __global ulong *p); +void __ovld __conv intel_sub_group_block_prefetch_ul4(const __global ulong *p); +void __ovld __conv intel_sub_group_block_prefetch_ul8(const __global ulong *p); #endif // defined(cl_intel_subgroup_buffer_prefetch) #endif // cl_intel_subgroups_long