From 77022d1836192bf722dbe5096d88b1459484a425 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Thu, 16 Oct 2025 22:13:16 -0700
Subject: [PATCH 1/2] ET Restrict macro

---
 backends/cuda/runtime/shims/int4mm.cuh   | 12 +++---
 backends/xnnpack/runtime/utils/utils.cpp |  8 ++--
 backends/xnnpack/runtime/utils/utils.h   |  8 ++--
 kernels/optimized/cpu/op_log_softmax.cpp |  4 +-
 kernels/portable/cpu/op_tril.cpp         |  8 ++--
 kernels/portable/cpu/vec_ops.h           | 51 ++++++++++++------------
 runtime/platform/compiler.h              |  7 ++++
 7 files changed, 53 insertions(+), 45 deletions(-)

diff --git a/backends/cuda/runtime/shims/int4mm.cuh b/backends/cuda/runtime/shims/int4mm.cuh
index 7f756d7c831..ee12fb51004 100644
--- a/backends/cuda/runtime/shims/int4mm.cuh
+++ b/backends/cuda/runtime/shims/int4mm.cuh
@@ -501,12 +501,12 @@ struct BLayout_TC_int4 {
       // 2 k-tiles packed is a uint32 (hence InnerKTiles == 2 is our smallest
       // value) 4 k-tiles packed is a uint32x2 (64 bits) 8 k-tiles packed is a
       // uint32x4 (128 bits)
-      const void* __restrict__ B,
+      const void* ET_RESTRICT B,
       // size [k / qGroupSize][n][2]
       // Contains the scale and zero point of each of the quantized int4 values
       // within B
       // v_reconstructed = (bf16(B_int4_val) * scale) - zero
-      const void* __restrict__ quantizationInfo,
+      const void* ET_RESTRICT quantizationInfo,
       int32_t n,
       int32_t k,
       int32_t nTiles,
@@ -643,16 +643,16 @@ template <
 __global__
 __launch_bounds__(Warps* kWarpSize) void tinygemm_m16n8k16_chunk_kernel(
     // Data for the A matrix, loaded as per ALayout
-    const void* const __restrict__ A,
+    const void* const ET_RESTRICT A,
 
     // Data for the B matrix, loaded as per BLayout
-    const void* const __restrict__ B,
+    const void* const ET_RESTRICT B,
 
     // Optional quantization data for dequantizing B, loaded as per BLayout
-    const void* const __restrict__ B_quantizationInfo,
+    const void* const ET_RESTRICT B_quantizationInfo,
 
     // Output data for the C matrix, stored as per CLayout
-    void* __restrict__ C,
+    void* ET_RESTRICT C,
 
     // The size of the matrix multiplication
     int32_t m,
diff --git a/backends/xnnpack/runtime/utils/utils.cpp b/backends/xnnpack/runtime/utils/utils.cpp
index bbcb8bc071c..0e017df978b 100644
--- a/backends/xnnpack/runtime/utils/utils.cpp
+++ b/backends/xnnpack/runtime/utils/utils.cpp
@@ -206,8 +206,8 @@ void vst1(int8_t* out, int8x8_t vout) {
 
 template <>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    uint8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    uint8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -216,8 +216,8 @@ void quantize_tensor_arm64_q8_wrapper(
 
 template <>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    int8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    int8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
diff --git a/backends/xnnpack/runtime/utils/utils.h b/backends/xnnpack/runtime/utils/utils.h
index 2eb079f0b0c..de8ee7970dd 100644
--- a/backends/xnnpack/runtime/utils/utils.h
+++ b/backends/xnnpack/runtime/utils/utils.h
@@ -82,8 +82,8 @@ void vst1(T* out, Tx8 vout);
 
 template <typename underlying_t, typename underlying_x8_t>
 void quantize_tensor_arm64_q8(
-    const float* __restrict__ in,
-    underlying_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    underlying_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -117,8 +117,8 @@ void quantize_tensor_arm64_q8(
 
 template <typename T>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    T* __restrict__ out,
+    const float* ET_RESTRICT in,
+    T* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point);
diff --git a/kernels/optimized/cpu/op_log_softmax.cpp b/kernels/optimized/cpu/op_log_softmax.cpp
index f56b0a37de2..629a81a6429 100644
--- a/kernels/optimized/cpu/op_log_softmax.cpp
+++ b/kernels/optimized/cpu/op_log_softmax.cpp
@@ -31,8 +31,8 @@ namespace {
 
 template <typename IN_T, typename OUT_T>
 void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
-  const IN_T* __restrict__ input_data_base = input.const_data_ptr<IN_T>();
-  OUT_T* __restrict__ output_data_base = out.mutable_data_ptr<OUT_T>();
+  const IN_T* ET_RESTRICT input_data_base = input.const_data_ptr<IN_T>();
+  OUT_T* ET_RESTRICT output_data_base = out.mutable_data_ptr<OUT_T>();
 
   if (input.dim() == 0) {
     output_data_base[0] = 0;
diff --git a/kernels/portable/cpu/op_tril.cpp b/kernels/portable/cpu/op_tril.cpp
index b21c9918a99..5fb43928883 100644
--- a/kernels/portable/cpu/op_tril.cpp
+++ b/kernels/portable/cpu/op_tril.cpp
@@ -38,8 +38,8 @@ Tensor& clear_out(Tensor& out) {
  */
 template <typename CTYPE>
 void apply_tril(
-    CTYPE* __restrict__ self,
-    CTYPE* __restrict__ out,
+    CTYPE* ET_RESTRICT self,
+    CTYPE* ET_RESTRICT out,
     int64_t diagonal,
     int64_t num_rows,
     int64_t num_cols,
@@ -104,8 +104,8 @@ void tril_kernel(
   int64_t col_stride = strides_ref[ndim - 1];
 
   for (const auto i : c10::irange(batch_size)) {
-    CTYPE* __restrict__ data_self_ptr = &data_self[i * self_stride];
-    CTYPE* __restrict__ data_out_ptr = &data_out[i * self_stride];
+    CTYPE* ET_RESTRICT data_self_ptr = &data_self[i * self_stride];
+    CTYPE* ET_RESTRICT data_out_ptr = &data_out[i * self_stride];
 
     apply_tril(
         data_self_ptr,
diff --git a/kernels/portable/cpu/vec_ops.h b/kernels/portable/cpu/vec_ops.h
index 7a1a488701b..87dd05ac7d4 100644
--- a/kernels/portable/cpu/vec_ops.h
+++ b/kernels/portable/cpu/vec_ops.h
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <algorithm>
+#include <executorch/runtime/platform/compiler.h>
 #include <cmath>
 #include <cstdint>
 #include <cstring>
@@ -45,9 +46,9 @@ inline float vec_maxf(const float* x, size_t size) {
 /// Add each element of `x` and `y` into the corresponding element of `z`. All
 /// arrays must have `size` elements.
 inline void vec_addf(
-    float* __restrict__ z,
-    const float* __restrict__ x,
-    const float* __restrict__ y,
+    float* ET_RESTRICT z,
+    const float* ET_RESTRICT x,
+    const float* ET_RESTRICT y,
     size_t size) {
   for (const auto i : c10::irange(size)) {
     z[i] = x[i] + y[i];
@@ -57,8 +58,8 @@ inline void vec_addf(
 /// Multiplies every element of `x` by `scale`, and writes the result into the
 /// corresponding element of `y`. `x` and `y` must have `size` elements.
 inline void vec_scalef(
-    float* __restrict__ y,
-    const float* __restrict__ x,
+    float* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     size_t size) {
   for (const auto i : c10::irange(size)) {
@@ -70,9 +71,9 @@ inline void vec_scalef(
 /// z[i][j] = sum(x[i][k] * y[k][j])
 template <typename T, typename U = T>
 inline void vec_matmul(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const U* __restrict__ y,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const U* ET_RESTRICT y,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -89,10 +90,10 @@
 
 template <typename T, typename U = T>
 inline void vec_quantized_matmul_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const U* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const U* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -115,10 +116,10 @@ static inline size_t bounds_min(size_t a, size_t b) {
 /// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
 template <typename T, typename U = T, typename V = U>
 inline void vec_quantized_matmul_transb_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const V* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const V* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -146,10 +147,10 @@ inline void vec_quantized_matmul_transb_int8(
 // T for tensor dtype, U for scalar type
 template <typename T, typename U = T>
 inline void vec_addmm(
-    T* __restrict__ out_data,
-    const T* __restrict__ self_data,
-    const T* __restrict__ mat1_data,
-    const T* __restrict__ mat2_data,
+    T* ET_RESTRICT out_data,
+    const T* ET_RESTRICT self_data,
+    const T* ET_RESTRICT mat1_data,
+    const T* ET_RESTRICT mat2_data,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -195,7 +196,7 @@ template <
     typename checkU = typename std::enable_if<
         std::is_same<float, typename std::remove_cv<U>::type>::value ||
        std::is_same<double, typename std::remove_cv<U>::type>::value>::type>
-inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) {
+inline void vec_softmax(T* ET_RESTRICT y, const U* ET_RESTRICT x, int n) {
   U max_x = *std::max_element(x, x + n);
 
   T sum = 0;
@@ -223,8 +224,8 @@ constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 /// Quantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `dequantize_i8_f32()`.
 inline void quantize_i8_f32(
-    int8_t* __restrict__ y,
-    const float* __restrict__ x,
+    int8_t* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
@@ -237,8 +238,8 @@ inline void quantize_i8_f32(
 /// Dequantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `quantize_i8_f32()`.
 inline void dequantize_i8_f32(
-    float* __restrict__ y,
-    const int8_t* __restrict__ x,
+    float* ET_RESTRICT y,
+    const int8_t* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h
index c970c12ea29..62324699923 100644
--- a/runtime/platform/compiler.h
+++ b/runtime/platform/compiler.h
@@ -65,6 +65,13 @@
 #define ET_INLINE __attribute__((always_inline)) inline
 #endif
 
+// Restrict
+#if defined(_MSC_VER)
+#define ET_RESTRICT __restrict
+#else
+#define ET_RESTRICT __restrict__
+#endif
+
 #if defined(__GNUC__)
 
 #define ET_UNREACHABLE() __builtin_unreachable()

From d7a8937038b0b10fb9f31daaf98d869745726b15 Mon Sep 17 00:00:00 2001
From: Jacob Szwejbka
Date: Fri, 17 Oct 2025 11:47:04 -0700
Subject: [PATCH 2/2] buck include

---
 kernels/portable/cpu/targets.bzl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernels/portable/cpu/targets.bzl b/kernels/portable/cpu/targets.bzl
index 2d4848f9588..83e8219402f 100644
--- a/kernels/portable/cpu/targets.bzl
+++ b/kernels/portable/cpu/targets.bzl
@@ -43,6 +43,7 @@ def define_common_targets():
         name = "vec_ops",
         exported_deps = [
             "//executorch/runtime/core/portable_type/c10/c10:c10",
+            "//executorch/runtime/platform:compiler",
         ],
         srcs = [],
         exported_headers = ["vec_ops.h"],
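
Note on usage: ET_RESTRICT expands to the compiler-specific restrict
qualifier (`__restrict` on MSVC, `__restrict__` on GCC/Clang), which tells
the optimizer that the annotated pointers never alias one another. The
sketch below is illustrative only; `scaled_add` is a hypothetical function,
not code from this patch series, and it assumes the compiler.h change above
is in place.

    // Hypothetical sketch of how a kernel adopts the new macro.
    #include <cstddef>
    #include <executorch/runtime/platform/compiler.h>

    // Declaring out/a/b with ET_RESTRICT promises the compiler that the
    // three buffers do not overlap, so stores to out[i] cannot change a[]
    // or b[]; that lets the loop vectorize without runtime alias checks.
    void scaled_add(
        float* ET_RESTRICT out,
        const float* ET_RESTRICT a,
        const float* ET_RESTRICT b,
        float scale,
        size_t size) {
      for (size_t i = 0; i < size; ++i) {
        out[i] = a[i] + scale * b[i];
      }
    }

    // Callers must uphold the no-alias contract: scaled_add(buf, buf, buf,
    // 2.0f, n) would be undefined behavior under restrict semantics.

Portability is the point of the macro: MSVC rejects the GCC/Clang spelling
`__restrict__`, so spelling the qualifier once in compiler.h keeps every
kernel buildable across toolchains.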