12 changes: 6 additions & 6 deletions backends/cuda/runtime/shims/int4mm.cuh
@@ -501,12 +501,12 @@ struct BLayout_TC_int4 {
       // 2 k-tiles packed is a uint32 (hence InnerKTiles == 2 is our smallest
       // value) 4 k-tiles packed is a uint32x2 (64 bits) 8 k-tiles packed is a
       // uint32x4 (128 bits)
-      const void* __restrict__ B,
+      const void* ET_RESTRICT B,
       // size [k / qGroupSize][n][2]
       // Contains the scale and zero point of each of the quantized int4 values
       // within B
       // v_reconstructed = (bf16(B_int4_val) * scale) - zero
-      const void* __restrict__ quantizationInfo,
+      const void* ET_RESTRICT quantizationInfo,
       int32_t n,
       int32_t k,
       int32_t nTiles,
@@ -643,16 +643,16 @@ template <
 __global__
 __launch_bounds__(Warps* kWarpSize) void tinygemm_m16n8k16_chunk_kernel(
     // Data for the A matrix, loaded as per ALayout
-    const void* const __restrict__ A,
+    const void* const ET_RESTRICT A,
 
     // Data for the B matrix, loaded as per BLayout
-    const void* const __restrict__ B,
+    const void* const ET_RESTRICT B,
 
     // Optional quantization data for dequantizing B, loaded as per BLayout
-    const void* const __restrict__ B_quantizationInfo,
+    const void* const ET_RESTRICT B_quantizationInfo,
 
     // Output data for the C matrix, stored as per CLayout
-    void* __restrict__ C,
+    void* ET_RESTRICT C,
 
     // The size of the matrix multiplication
     int32_t m,
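The inline comments above also document how B is dequantized: v_reconstructed = (bf16(B_int4_val) * scale) - zero, with scale and zero read from quantizationInfo once per qGroupSize group. As a hedged illustration of that rule only (float stands in for bf16, and scalar extraction stands in for the kernel's packed uint32x{1,2,4} register loads; dequant_one_int4 is an illustrative name, not part of the kernel):

    #include <cstdint>

    // Sketch: pull one 4-bit value out of a packed 32-bit word (eight
    // nibbles per word) and apply the documented reconstruction rule.
    inline float dequant_one_int4(
        uint32_t packed, int lane, float scale, float zero) {
      uint32_t b_int4_val = (packed >> (lane * 4)) & 0xF;
      return static_cast<float>(b_int4_val) * scale - zero;
    }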
8 changes: 4 additions & 4 deletions backends/xnnpack/runtime/utils/utils.cpp
@@ -206,8 +206,8 @@ void vst1<int8_t, int8x8_t>(int8_t* out, int8x8_t vout) {
 
 template <>
 void quantize_tensor_arm64_q8_wrapper<uint8_t>(
-    const float* __restrict__ in,
-    uint8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    uint8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -216,8 +216,8 @@ void quantize_tensor_arm64_q8_wrapper<uint8_t>(
 
 template <>
 void quantize_tensor_arm64_q8_wrapper<int8_t>(
-    const float* __restrict__ in,
-    int8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    int8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
8 changes: 4 additions & 4 deletions backends/xnnpack/runtime/utils/utils.h
@@ -82,8 +82,8 @@ void vst1(T* out, Tx8 vout);
 
 template <typename underlying_t, typename underlying_x8_t>
 void quantize_tensor_arm64_q8(
-    const float* __restrict__ in,
-    underlying_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    underlying_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -117,8 +117,8 @@ void quantize_tensor_arm64_q8(
 
 template <typename T>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    T* __restrict__ out,
+    const float* ET_RESTRICT in,
+    T* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point);
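These wrappers (and the templated quantize_tensor_arm64_q8 they specialize) implement standard affine quantization. Assuming the usual mapping q = clamp(round(x / scale) + zero_point, qmin, qmax), a scalar reference would look like the sketch below; the NEON path computes the same thing eight lanes at a time, and this function name is illustrative, not the shipped code.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Scalar reference sketch for the arm64 q8 paths; T is uint8_t or int8_t.
    template <typename T>
    void quantize_tensor_scalar_q8(
        const float* ET_RESTRICT in,
        T* ET_RESTRICT out,
        const int64_t N,
        const float scale,
        const int32_t zero_point) {
      constexpr int32_t qmin = std::numeric_limits<T>::min();
      constexpr int32_t qmax = std::numeric_limits<T>::max();
      for (int64_t i = 0; i < N; ++i) {
        const int32_t q =
            static_cast<int32_t>(std::nearbyint(in[i] / scale)) + zero_point;
        out[i] = static_cast<T>(std::clamp(q, qmin, qmax));
      }
    }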
4 changes: 2 additions & 2 deletions kernels/optimized/cpu/op_log_softmax.cpp
@@ -31,8 +31,8 @@ namespace {
 
 template <typename IN_T, typename OUT_T>
 void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
-  const IN_T* __restrict__ input_data_base = input.const_data_ptr<IN_T>();
-  OUT_T* __restrict__ output_data_base = out.mutable_data_ptr<OUT_T>();
+  const IN_T* ET_RESTRICT input_data_base = input.const_data_ptr<IN_T>();
+  OUT_T* ET_RESTRICT output_data_base = out.mutable_data_ptr<OUT_T>();
 
   if (input.dim() == 0) {
     output_data_base[0] = 0;
8 changes: 4 additions & 4 deletions kernels/portable/cpu/op_tril.cpp
@@ -38,8 +38,8 @@ Tensor& clear_out(Tensor& out) {
  */
 template <typename CTYPE>
 void apply_tril(
-    CTYPE* __restrict__ self,
-    CTYPE* __restrict__ out,
+    CTYPE* ET_RESTRICT self,
+    CTYPE* ET_RESTRICT out,
     int64_t diagonal,
     int64_t num_rows,
     int64_t num_cols,
@@ -104,8 +104,8 @@ void tril_kernel(
   int64_t col_stride = strides_ref[ndim - 1];
 
   for (const auto i : c10::irange(batch_size)) {
-    CTYPE* __restrict__ data_self_ptr = &data_self[i * self_stride];
-    CTYPE* __restrict__ data_out_ptr = &data_out[i * self_stride];
+    CTYPE* ET_RESTRICT data_self_ptr = &data_self[i * self_stride];
+    CTYPE* ET_RESTRICT data_out_ptr = &data_out[i * self_stride];
 
     apply_tril<CTYPE>(
         data_self_ptr,
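For context on what apply_tril computes: it keeps the elements of each 2-D slice that lie on or below the (offset) main diagonal and zeroes the rest. A dense, row-major sketch of that semantic under contiguous-storage assumptions (the real function also takes row and column strides; tril_dense is an illustrative name):

    // Dense row-major sketch of the tril semantic.
    template <typename CTYPE>
    void tril_dense(
        const CTYPE* ET_RESTRICT self,
        CTYPE* ET_RESTRICT out,
        int64_t diagonal,
        int64_t num_rows,
        int64_t num_cols) {
      for (int64_t r = 0; r < num_rows; ++r) {
        for (int64_t c = 0; c < num_cols; ++c) {
          // Keep entries with c <= r + diagonal; zero everything above.
          out[r * num_cols + c] =
              (c <= r + diagonal) ? self[r * num_cols + c] : CTYPE(0);
        }
      }
    }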
1 change: 1 addition & 0 deletions kernels/portable/cpu/targets.bzl
@@ -43,6 +43,7 @@ def define_common_targets():
         name = "vec_ops",
         exported_deps = [
             "//executorch/runtime/core/portable_type/c10/c10:c10",
+            "//executorch/runtime/platform:compiler",
         ],
         srcs = [],
         exported_headers = ["vec_ops.h"],
51 changes: 26 additions & 25 deletions kernels/portable/cpu/vec_ops.h
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <c10/util/irange.h>
+#include <executorch/runtime/platform/compiler.h>
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -45,9 +46,9 @@ inline float vec_maxf(const float* x, size_t size) {
 /// Add each element of `x` and `y` into the corresponding element of `z`. All
 /// arrays must have `size` elements.
 inline void vec_addf(
-    float* __restrict__ z,
-    const float* __restrict__ x,
-    const float* __restrict__ y,
+    float* ET_RESTRICT z,
+    const float* ET_RESTRICT x,
+    const float* ET_RESTRICT y,
     size_t size) {
   for (const auto i : c10::irange(size)) {
     z[i] = x[i] + y[i];
@@ -57,8 +58,8 @@ inline void vec_addf(
 /// Multiplies every element of `x` by `scale`, and writes the result into the
 /// corresponding element of `y`. `x` and `y` must have `size` elements.
 inline void vec_scalef(
-    float* __restrict__ y,
-    const float* __restrict__ x,
+    float* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     size_t size) {
   for (const auto i : c10::irange(size)) {
@@ -70,9 +71,9 @@
 /// z[i][j] = sum(x[i][k] * y[k][j])
 template <typename T, typename U = T>
 inline void vec_matmul(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const U* __restrict__ y,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const U* ET_RESTRICT y,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -89,10 +90,10 @@
 
 template <typename T, typename U = T>
 inline void vec_quantized_matmul_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const U* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const U* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -115,10 +116,10 @@ static inline size_t bounds_min(size_t a, size_t b) {
 /// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
 template <typename T, typename U = T, typename V = U>
 inline void vec_quantized_matmul_transb_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const V* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const V* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -146,10 +147,10 @@ inline void vec_quantized_matmul_transb_int8(
 // T for tensor dtype, U for scalar type
 template <typename T, typename U = T>
 inline void vec_addmm(
-    T* __restrict__ out_data,
-    const T* __restrict__ self_data,
-    const T* __restrict__ mat1_data,
-    const T* __restrict__ mat2_data,
+    T* ET_RESTRICT out_data,
+    const T* ET_RESTRICT self_data,
+    const T* ET_RESTRICT mat1_data,
+    const T* ET_RESTRICT mat2_data,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -195,7 +196,7 @@ template <
     typename checkU = typename std::enable_if<
         std::is_same<float, typename std::remove_cv<U>::type>::value ||
         std::is_same<double, typename std::remove_cv<U>::type>::value>::type>
-inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) {
+inline void vec_softmax(T* ET_RESTRICT y, const U* ET_RESTRICT x, int n) {
   U max_x = *std::max_element(x, x + n);
   T sum = 0;
 
@@ -223,8 +224,8 @@ constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 /// Quantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `dequantize_i8_f32()`.
 inline void quantize_i8_f32(
-    int8_t* __restrict__ y,
-    const float* __restrict__ x,
+    int8_t* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
@@ -237,8 +238,8 @@ inline void quantize_i8_f32(
 /// Dequantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `quantize_i8_f32()`.
 inline void dequantize_i8_f32(
-    float* __restrict__ y,
-    const int8_t* __restrict__ x,
+    float* ET_RESTRICT y,
+    const int8_t* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
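One caller-facing note on this file: ET_RESTRICT, like __restrict__ before it, is a promise to the compiler that the pointer arguments do not alias, so passing overlapping buffers to these helpers remains undefined behavior. A sketch with vec_addf (namespace qualification omitted; assume vec_ops.h is included and the function is visible):

    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {5.0f, 6.0f, 7.0f, 8.0f};
    float z[4];
    vec_addf(z, x, y, 4); // fine: z, x, and y are distinct buffers

    // vec_addf(x, x, y, 4); // would break the no-alias promise: UB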
7 changes: 7 additions & 0 deletions runtime/platform/compiler.h
@@ -65,6 +65,13 @@
 #define ET_INLINE __attribute__((always_inline)) inline
 #endif
 
+// Restrict
+#if defined(_MSC_VER)
+#define ET_RESTRICT __restrict
+#else
+#define ET_RESTRICT __restrict__
+#endif
+
 #if defined(__GNUC__)
 
 #define ET_UNREACHABLE() __builtin_unreachable()
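This hunk is the core of the PR: ISO C++ has no standard restrict keyword, and the compiler-specific spellings differ (MSVC accepts __restrict; GCC and Clang accept __restrict__), so ET_RESTRICT gives every call site one portable name. A minimal sketch of the effect, with an illustrative function:

    #include <executorch/runtime/platform/compiler.h>

    // Expands to __restrict under MSVC and __restrict__ elsewhere. Either
    // way, the compiler may assume dst and src never overlap, which lets it
    // vectorize the loop without emitting runtime overlap checks.
    void scale_by_two(
        float* ET_RESTRICT dst, const float* ET_RESTRICT src, int n) {
      for (int i = 0; i < n; ++i) {
        dst[i] = 2.0f * src[i];
      }
    }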