From 77022d1836192bf722dbe5096d88b1459484a425 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Thu, 16 Oct 2025 22:13:16 -0700
Subject: [PATCH 1/2] ET Restrict macro

---
 backends/cuda/runtime/shims/int4mm.cuh   | 12 +++---
 backends/xnnpack/runtime/utils/utils.cpp |  8 ++--
 backends/xnnpack/runtime/utils/utils.h   |  8 ++--
 kernels/optimized/cpu/op_log_softmax.cpp |  4 +-
 kernels/portable/cpu/op_tril.cpp         |  8 ++--
 kernels/portable/cpu/vec_ops.h           | 51 ++++++++++++------------
 runtime/platform/compiler.h              |  7 ++++
 7 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/backends/cuda/runtime/shims/int4mm.cuh b/backends/cuda/runtime/shims/int4mm.cuh
index 7f756d7c831..ee12fb51004 100644
--- a/backends/cuda/runtime/shims/int4mm.cuh
+++ b/backends/cuda/runtime/shims/int4mm.cuh
@@ -501,12 +501,12 @@ struct BLayout_TC_int4 {
       // 2 k-tiles packed is a uint32 (hence InnerKTiles == 2 is our smallest
       // value) 4 k-tiles packed is a uint32x2 (64 bits) 8 k-tiles packed is a
       // uint32x4 (128 bits)
-      const void* __restrict__ B,
+      const void* ET_RESTRICT B,
       // size [k / qGroupSize][n][2]
       // Contains the scale and zero point of each of the quantized int4 values
       // within B
       // v_reconstructed = (bf16(B_int4_val) * scale) - zero
-      const void* __restrict__ quantizationInfo,
+      const void* ET_RESTRICT quantizationInfo,
       int32_t n,
       int32_t k,
       int32_t nTiles,
@@ -643,16 +643,16 @@ template <
 __global__
 __launch_bounds__(Warps* kWarpSize) void tinygemm_m16n8k16_chunk_kernel(
     // Data for the A matrix, loaded as per ALayout
-    const void* const __restrict__ A,
+    const void* const ET_RESTRICT A,
 
     // Data for the B matrix, loaded as per BLayout
-    const void* const __restrict__ B,
+    const void* const ET_RESTRICT B,
 
     // Optional quantization data for dequantizing B, loaded as per BLayout
-    const void* const __restrict__ B_quantizationInfo,
+    const void* const ET_RESTRICT B_quantizationInfo,
 
     // Output data for the C matrix, stored as per CLayout
-    void* __restrict__ C,
+    void* ET_RESTRICT C,
 
     // The size of the matrix multiplication
     int32_t m,
diff --git a/backends/xnnpack/runtime/utils/utils.cpp b/backends/xnnpack/runtime/utils/utils.cpp
index bbcb8bc071c..0e017df978b 100644
--- a/backends/xnnpack/runtime/utils/utils.cpp
+++ b/backends/xnnpack/runtime/utils/utils.cpp
@@ -206,8 +206,8 @@ void vst1(int8_t* out, int8x8_t vout) {
 
 template <>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    uint8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    uint8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -216,8 +216,8 @@ void quantize_tensor_arm64_q8_wrapper(
 
 template <>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    int8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    int8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
diff --git a/backends/xnnpack/runtime/utils/utils.h b/backends/xnnpack/runtime/utils/utils.h
index 2eb079f0b0c..de8ee7970dd 100644
--- a/backends/xnnpack/runtime/utils/utils.h
+++ b/backends/xnnpack/runtime/utils/utils.h
@@ -82,8 +82,8 @@ void vst1(T* out, Tx8 vout);
 
 template <typename underlying_t, typename underlying_x8_t>
 void quantize_tensor_arm64_q8(
-    const float* __restrict__ in,
-    underlying_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    underlying_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -117,8 +117,8 @@ void quantize_tensor_arm64_q8(
 
 template <typename T>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    T* __restrict__ out,
+    const float* ET_RESTRICT in,
+    T* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point);
diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp
index f56b0a37de2..629a81a6429 100644
--- a/kernels/optimized/cpu/op_log_softmax.cpp
+++ b/kernels/optimized/cpu/op_log_softmax.cpp
@@ -31,8 +31,8 @@ namespace {
 
 template <typename IN_T, typename OUT_T>
 void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
-  const IN_T* __restrict__ input_data_base = input.const_data_ptr<IN_T>();
-  OUT_T* __restrict__ output_data_base = out.mutable_data_ptr<OUT_T>();
+  const IN_T* ET_RESTRICT input_data_base = input.const_data_ptr<IN_T>();
+  OUT_T* ET_RESTRICT output_data_base = out.mutable_data_ptr<OUT_T>();
 
   if (input.dim() == 0) {
     output_data_base[0] = 0;
diff --git a/kernels/portable/cpu/op_tril.cpp b/kernels/portable/cpu/op_tril.cpp
index b21c9918a99..5fb43928883 100644
--- a/kernels/portable/cpu/op_tril.cpp
+++ b/kernels/portable/cpu/op_tril.cpp
@@ -38,8 +38,8 @@ Tensor& clear_out(Tensor& out) {
  */
 template <typename CTYPE>
 void apply_tril(
-    CTYPE* __restrict__ self,
-    CTYPE* __restrict__ out,
+    CTYPE* ET_RESTRICT self,
+    CTYPE* ET_RESTRICT out,
     int64_t diagonal,
     int64_t num_rows,
     int64_t num_cols,
@@ -104,8 +104,8 @@ void tril_kernel(
   int64_t col_stride = strides_ref[ndim - 1];
 
   for (const auto i : c10::irange(batch_size)) {
-    CTYPE* __restrict__ data_self_ptr = &data_self[i * self_stride];
-    CTYPE* __restrict__ data_out_ptr = &data_out[i * self_stride];
+    CTYPE* ET_RESTRICT data_self_ptr = &data_self[i * self_stride];
+    CTYPE* ET_RESTRICT data_out_ptr = &data_out[i * self_stride];
 
     apply_tril(
         data_self_ptr,
diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h
index 7a1a488701b..87dd05ac7d4 100644
--- a/kernels/portable/cpu/vec_ops.h
+++ b/kernels/portable/cpu/vec_ops.h
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <algorithm>
+#include <executorch/runtime/platform/compiler.h>
 #include <cmath>
 #include <cstdint>
 #include <cstring>
@@ -45,9 +46,9 @@ inline float vec_maxf(const float* x, size_t size) {
 /// Add each element of `x` and `y` into the corresponding element of `z`. All
 /// arrays must have `size` elements.
 inline void vec_addf(
-    float* __restrict__ z,
-    const float* __restrict__ x,
-    const float* __restrict__ y,
+    float* ET_RESTRICT z,
+    const float* ET_RESTRICT x,
+    const float* ET_RESTRICT y,
     size_t size) {
   for (const auto i : c10::irange(size)) {
     z[i] = x[i] + y[i];
@@ -57,8 +58,8 @@ inline void vec_addf(
 /// Multiplies every element of `x` by `scale`, and writes the result into the
 /// corresponding element of `y`. `x` and `y` must have `size` elements.
 inline void vec_scalef(
-    float* __restrict__ y,
-    const float* __restrict__ x,
+    float* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     size_t size) {
   for (const auto i : c10::irange(size)) {
@@ -70,9 +71,9 @@ inline void vec_scalef(
 /// z[i][j] = sum(x[i][k] * y[k][j])
 template <typename T, typename U = T>
 inline void vec_matmul(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const U* __restrict__ y,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const U* ET_RESTRICT y,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -89,10 +90,10 @@
 
 template <typename T, typename U = T>
 inline void vec_quantized_matmul_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const U* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const U* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -115,10 +116,10 @@ static inline size_t bounds_min(size_t a, size_t b) {
 /// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
 template <typename T, typename U = T, typename V = U>
 inline void vec_quantized_matmul_transb_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const V* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const V* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -146,10 +147,10 @@ inline void vec_quantized_matmul_transb_int8(
 // T for tensor dtype, U for scalar type
 template <typename T, typename U = T>
 inline void vec_addmm(
-    T* __restrict__ out_data,
-    const T* __restrict__ self_data,
-    const T* __restrict__ mat1_data,
-    const T* __restrict__ mat2_data,
+    T* ET_RESTRICT out_data,
+    const T* ET_RESTRICT self_data,
+    const T* ET_RESTRICT mat1_data,
+    const T* ET_RESTRICT mat2_data,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -195,7 +196,7 @@ template <
     typename checkU = typename std::enable_if<
         std::is_same<float, typename std::remove_cv<U>::type>::value ||
        std::is_same<double, typename std::remove_cv<U>::type>::value>::type>
-inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) {
+inline void vec_softmax(T* ET_RESTRICT y, const U* ET_RESTRICT x, int n) {
   U max_x = *std::max_element(x, x + n);
 
   T sum = 0;
@@ -223,8 +224,8 @@ constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 /// Quantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `dequantize_i8_f32()`.
 inline void quantize_i8_f32(
-    int8_t* __restrict__ y,
-    const float* __restrict__ x,
+    int8_t* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
@@ -237,8 +238,8 @@ inline void quantize_i8_f32(
 /// Dequantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `quantize_i8_f32()`.
 inline void dequantize_i8_f32(
-    float* __restrict__ y,
-    const int8_t* __restrict__ x,
+    float* ET_RESTRICT y,
+    const int8_t* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h
index c970c12ea29..62324699923 100644
--- a/runtime/platform/compiler.h
+++ b/runtime/platform/compiler.h
@@ -65,6 +65,13 @@
 #define ET_INLINE __attribute__((always_inline)) inline
 #endif
 
+// Restrict
+#if defined(_MSC_VER)
+#define ET_RESTRICT __restrict
+#else
+#define ET_RESTRICT __restrict__
+#endif
+
 #if defined(__GNUC__)
 
 #define ET_UNREACHABLE() __builtin_unreachable()

From d7a8937038b0b10fb9f31daaf98d869745726b15 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Fri, 17 Oct 2025 11:47:04 -0700
Subject: [PATCH 2/2] buck include

---
 kernels/portable/cpu/targets.bzl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl
index 2d4848f9588..83e8219402f 100644
--- a/kernels/portable/cpu/targets.bzl
+++ b/kernels/portable/cpu/targets.bzl
@@ -43,6 +43,7 @@ def define_common_targets():
         name = "vec_ops",
         exported_deps = [
             "//executorch/runtime/core/portable_type/c10/c10:c10",
+            "//executorch/runtime/platform:compiler",
         ],
         srcs = [],
         exported_headers = ["vec_ops.h"],
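
Note on usage: ET_RESTRICT expands to the compiler-specific restrict
qualifier (`__restrict` on MSVC, `__restrict__` on GCC/Clang), which tells
the optimizer that the annotated pointers never alias one another. The
sketch below is illustrative only; `scaled_add` is a hypothetical function,
not code from this patch series, and it assumes the compiler.h change above
is in place.

    // Hypothetical sketch of how a kernel adopts the new macro.
    #include <cstddef>
    #include <executorch/runtime/platform/compiler.h>

    // Declaring out/a/b with ET_RESTRICT promises the compiler that the
    // three buffers do not overlap, so stores to out[i] cannot change a[]
    // or b[]; that lets the loop vectorize without runtime alias checks.
    void scaled_add(
        float* ET_RESTRICT out,
        const float* ET_RESTRICT a,
        const float* ET_RESTRICT b,
        float scale,
        size_t size) {
      for (size_t i = 0; i < size; ++i) {
        out[i] = a[i] + scale * b[i];
      }
    }

    // Callers must uphold the no-alias contract: scaled_add(buf, buf, buf,
    // 2.0f, n) would be undefined behavior under restrict semantics.

Portability is the point of the macro: MSVC rejects the GCC/Clang spelling
`__restrict__`, so spelling the qualifier once in compiler.h keeps every
kernel buildable across toolchains.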