Skip to content

Commit 77022d1

Browse files
committed
ET Restrict macro
1 parent 173b046 commit 77022d1

File tree

7 files changed

+53
-45
lines changed

7 files changed

+53
-45
lines changed

backends/cuda/runtime/shims/int4mm.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -501,12 +501,12 @@ struct BLayout_TC_int4 {
501501
// 2 k-tiles packed is a uint32 (hence InnerKTiles == 2 is our smallest
502502
// value) 4 k-tiles packed is a uint32x2 (64 bits) 8 k-tiles packed is a
503503
// uint32x4 (128 bits)
504-
const void* __restrict__ B,
504+
const void* ET_RESTRICT B,
505505
// size [k / qGroupSize][n][2]
506506
// Contains the scale and zero point of each of the quantized int4 values
507507
// within B
508508
// v_reconstructed = (bf16(B_int4_val) * scale) - zero
509-
const void* __restrict__ quantizationInfo,
509+
const void* ET_RESTRICT quantizationInfo,
510510
int32_t n,
511511
int32_t k,
512512
int32_t nTiles,
@@ -643,16 +643,16 @@ template <
643643
__global__
644644
__launch_bounds__(Warps* kWarpSize) void tinygemm_m16n8k16_chunk_kernel(
645645
// Data for the A matrix, loaded as per ALayout
646-
const void* const __restrict__ A,
646+
const void* const ET_RESTRICT A,
647647

648648
// Data for the B matrix, loaded as per BLayout
649-
const void* const __restrict__ B,
649+
const void* const ET_RESTRICT B,
650650

651651
// Optional quantization data for dequantizing B, loaded as per BLayout
652-
const void* const __restrict__ B_quantizationInfo,
652+
const void* const ET_RESTRICT B_quantizationInfo,
653653

654654
// Output data for the C matrix, stored as per CLayout
655-
void* __restrict__ C,
655+
void* ET_RESTRICT C,
656656

657657
// The size of the matrix multiplication
658658
int32_t m,

backends/xnnpack/runtime/utils/utils.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,8 @@ void vst1<int8_t, int8x8_t>(int8_t* out, int8x8_t vout) {
206206

207207
template <>
208208
void quantize_tensor_arm64_q8_wrapper<uint8_t>(
209-
const float* __restrict__ in,
210-
uint8_t* __restrict__ out,
209+
const float* ET_RESTRICT in,
210+
uint8_t* ET_RESTRICT out,
211211
const int64_t N,
212212
const float scale,
213213
const int32_t zero_point) {
@@ -216,8 +216,8 @@ void quantize_tensor_arm64_q8_wrapper<uint8_t>(
216216

217217
template <>
218218
void quantize_tensor_arm64_q8_wrapper<int8_t>(
219-
const float* __restrict__ in,
220-
int8_t* __restrict__ out,
219+
const float* ET_RESTRICT in,
220+
int8_t* ET_RESTRICT out,
221221
const int64_t N,
222222
const float scale,
223223
const int32_t zero_point) {

backends/xnnpack/runtime/utils/utils.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,8 @@ void vst1(T* out, Tx8 vout);
8282

8383
template <typename underlying_t, typename underlying_x8_t>
8484
void quantize_tensor_arm64_q8(
85-
const float* __restrict__ in,
86-
underlying_t* __restrict__ out,
85+
const float* ET_RESTRICT in,
86+
underlying_t* ET_RESTRICT out,
8787
const int64_t N,
8888
const float scale,
8989
const int32_t zero_point) {
@@ -117,8 +117,8 @@ void quantize_tensor_arm64_q8(
117117

118118
template <typename T>
119119
void quantize_tensor_arm64_q8_wrapper(
120-
const float* __restrict__ in,
121-
T* __restrict__ out,
120+
const float* ET_RESTRICT in,
121+
T* ET_RESTRICT out,
122122
const int64_t N,
123123
const float scale,
124124
const int32_t zero_point);

kernels/optimized/cpu/op_log_softmax.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ namespace {
3131

3232
template <typename IN_T, typename OUT_T>
3333
void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
34-
const IN_T* __restrict__ input_data_base = input.const_data_ptr<IN_T>();
35-
OUT_T* __restrict__ output_data_base = out.mutable_data_ptr<OUT_T>();
34+
const IN_T* ET_RESTRICT input_data_base = input.const_data_ptr<IN_T>();
35+
OUT_T* ET_RESTRICT output_data_base = out.mutable_data_ptr<OUT_T>();
3636

3737
if (input.dim() == 0) {
3838
output_data_base[0] = 0;

kernels/portable/cpu/op_tril.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ Tensor& clear_out(Tensor& out) {
3838
*/
3939
template <typename CTYPE>
4040
void apply_tril(
41-
CTYPE* __restrict__ self,
42-
CTYPE* __restrict__ out,
41+
CTYPE* ET_RESTRICT self,
42+
CTYPE* ET_RESTRICT out,
4343
int64_t diagonal,
4444
int64_t num_rows,
4545
int64_t num_cols,
@@ -104,8 +104,8 @@ void tril_kernel(
104104
int64_t col_stride = strides_ref[ndim - 1];
105105

106106
for (const auto i : c10::irange(batch_size)) {
107-
CTYPE* __restrict__ data_self_ptr = &data_self[i * self_stride];
108-
CTYPE* __restrict__ data_out_ptr = &data_out[i * self_stride];
107+
CTYPE* ET_RESTRICT data_self_ptr = &data_self[i * self_stride];
108+
CTYPE* ET_RESTRICT data_out_ptr = &data_out[i * self_stride];
109109

110110
apply_tril<CTYPE>(
111111
data_self_ptr,

kernels/portable/cpu/vec_ops.h

Lines changed: 26 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#pragma once
1111

1212
#include <c10/util/irange.h>
13+
#include <executorch/runtime/platform/compiler.h>
1314
#include <algorithm>
1415
#include <cmath>
1516
#include <cstdint>
@@ -45,9 +46,9 @@ inline float vec_maxf(const float* x, size_t size) {
4546
/// Add each element of `x` and `y` into the corresponding element of `z`. All
4647
/// arrays must have `size` elements.
4748
inline void vec_addf(
48-
float* __restrict__ z,
49-
const float* __restrict__ x,
50-
const float* __restrict__ y,
49+
float* ET_RESTRICT z,
50+
const float* ET_RESTRICT x,
51+
const float* ET_RESTRICT y,
5152
size_t size) {
5253
for (const auto i : c10::irange(size)) {
5354
z[i] = x[i] + y[i];
@@ -57,8 +58,8 @@ inline void vec_addf(
5758
/// Multiplies every element of `x` by `scale`, and writes the result into the
5859
/// corresponding element of `y`. `x` and `y` must have `size` elements.
5960
inline void vec_scalef(
60-
float* __restrict__ y,
61-
const float* __restrict__ x,
61+
float* ET_RESTRICT y,
62+
const float* ET_RESTRICT x,
6263
float scale,
6364
size_t size) {
6465
for (const auto i : c10::irange(size)) {
@@ -70,9 +71,9 @@ inline void vec_scalef(
7071
/// z[i][j] = sum(x[i][k] * y[k][j])
7172
template <typename T, typename U = T>
7273
inline void vec_matmul(
73-
T* __restrict__ z,
74-
const U* __restrict__ x,
75-
const U* __restrict__ y,
74+
T* ET_RESTRICT z,
75+
const U* ET_RESTRICT x,
76+
const U* ET_RESTRICT y,
7677
int64_t m,
7778
int64_t n,
7879
int64_t p) {
@@ -89,10 +90,10 @@ inline void vec_matmul(
8990

9091
template <typename T, typename U = T>
9192
inline void vec_quantized_matmul_int8(
92-
T* __restrict__ z,
93-
const U* __restrict__ x,
94-
const int8_t* __restrict__ y,
95-
const U* __restrict__ s,
93+
T* ET_RESTRICT z,
94+
const U* ET_RESTRICT x,
95+
const int8_t* ET_RESTRICT y,
96+
const U* ET_RESTRICT s,
9697
int64_t m,
9798
int64_t n,
9899
int64_t p) {
@@ -115,10 +116,10 @@ static inline size_t bounds_min(size_t a, size_t b) {
115116
/// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
116117
template <typename T, typename U = T, typename V = U>
117118
inline void vec_quantized_matmul_transb_int8(
118-
T* __restrict__ z,
119-
const U* __restrict__ x,
120-
const int8_t* __restrict__ y,
121-
const V* __restrict__ s,
119+
T* ET_RESTRICT z,
120+
const U* ET_RESTRICT x,
121+
const int8_t* ET_RESTRICT y,
122+
const V* ET_RESTRICT s,
122123
int64_t m,
123124
int64_t n,
124125
int64_t p,
@@ -146,10 +147,10 @@ inline void vec_quantized_matmul_transb_int8(
146147
// T for tensor dtype, U for scalar type
147148
template <typename T, typename U = T>
148149
inline void vec_addmm(
149-
T* __restrict__ out_data,
150-
const T* __restrict__ self_data,
151-
const T* __restrict__ mat1_data,
152-
const T* __restrict__ mat2_data,
150+
T* ET_RESTRICT out_data,
151+
const T* ET_RESTRICT self_data,
152+
const T* ET_RESTRICT mat1_data,
153+
const T* ET_RESTRICT mat2_data,
153154
int64_t m,
154155
int64_t n,
155156
int64_t p,
@@ -195,7 +196,7 @@ template <
195196
typename checkU = typename std::enable_if<
196197
std::is_same<float, typename std::remove_cv<U>::type>::value ||
197198
std::is_same<double, typename std::remove_cv<U>::type>::value>::type>
198-
inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) {
199+
inline void vec_softmax(T* ET_RESTRICT y, const U* ET_RESTRICT x, int n) {
199200
U max_x = *std::max_element(x, x + n);
200201
T sum = 0;
201202

@@ -223,8 +224,8 @@ constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
223224
/// Quantizes the elements of `x` into `y`, both of which must have `size`
224225
/// elements. Inverse of `dequantize_i8_f32()`.
225226
inline void quantize_i8_f32(
226-
int8_t* __restrict__ y,
227-
const float* __restrict__ x,
227+
int8_t* ET_RESTRICT y,
228+
const float* ET_RESTRICT x,
228229
float scale,
229230
int32_t zero_point,
230231
size_t size) {
@@ -237,8 +238,8 @@ inline void quantize_i8_f32(
237238
/// Dequantizes the elements of `x` into `y`, both of which must have `size`
238239
/// elements. Inverse of `quantize_i8_f32()`.
239240
inline void dequantize_i8_f32(
240-
float* __restrict__ y,
241-
const int8_t* __restrict__ x,
241+
float* ET_RESTRICT y,
242+
const int8_t* ET_RESTRICT x,
242243
float scale,
243244
int32_t zero_point,
244245
size_t size) {

runtime/platform/compiler.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,13 @@
6565
#define ET_INLINE __attribute__((always_inline)) inline
6666
#endif
6767

68+
// Restrict
69+
#if defined(_MSC_VER)
70+
#define ET_RESTRICT __restrict
71+
#else
72+
#define ET_RESTRICT __restrict__
73+
#endif
74+
6875
#if defined(__GNUC__)
6976

7077
#define ET_UNREACHABLE() __builtin_unreachable()

0 commit comments

Comments
 (0)