12 changes: 6 additions & 6 deletions backends/cuda/runtime/shims/int4mm.cuh
@@ -501,12 +501,12 @@ struct BLayout_TC_int4 {
       // 2 k-tiles packed is a uint32 (hence InnerKTiles == 2 is our smallest
       // value) 4 k-tiles packed is a uint32x2 (64 bits) 8 k-tiles packed is a
       // uint32x4 (128 bits)
-      const void* __restrict__ B,
+      const void* ET_RESTRICT B,
       // size [k / qGroupSize][n][2]
       // Contains the scale and zero point of each of the quantized int4 values
       // within B
       // v_reconstructed = (bf16(B_int4_val) * scale) - zero
-      const void* __restrict__ quantizationInfo,
+      const void* ET_RESTRICT quantizationInfo,
       int32_t n,
       int32_t k,
       int32_t nTiles,
@@ -643,16 +643,16 @@ template <
 __global__
 __launch_bounds__(Warps* kWarpSize) void tinygemm_m16n8k16_chunk_kernel(
     // Data for the A matrix, loaded as per ALayout
-    const void* const __restrict__ A,
+    const void* const ET_RESTRICT A,
 
     // Data for the B matrix, loaded as per BLayout
-    const void* const __restrict__ B,
+    const void* const ET_RESTRICT B,
 
     // Optional quantization data for dequantizing B, loaded as per BLayout
-    const void* const __restrict__ B_quantizationInfo,
+    const void* const ET_RESTRICT B_quantizationInfo,
 
     // Output data for the C matrix, stored as per CLayout
-    void* __restrict__ C,
+    void* ET_RESTRICT C,
 
     // The size of the matrix multiplication
     int32_t m,
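The inline comments above also document how B is dequantized: v_reconstructed = (bf16(B_int4_val) * scale) - zero, with scale and zero read from quantizationInfo once per qGroupSize group. As a hedged illustration of that rule only (float stands in for bf16, and scalar extraction stands in for the kernel's packed uint32x{1,2,4} register loads; dequant_one_int4 is an illustrative name, not part of the kernel):

    #include <cstdint>

    // Sketch: pull one 4-bit value out of a packed 32-bit word (eight
    // nibbles per word) and apply the documented reconstruction rule.
    inline float dequant_one_int4(
        uint32_t packed, int lane, float scale, float zero) {
      uint32_t b_int4_val = (packed >> (lane * 4)) & 0xF;
      return static_cast<float>(b_int4_val) * scale - zero;
    }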
8 changes: 4 additions & 4 deletions backends/xnnpack/runtime/utils/utils.cpp
@@ -206,8 +206,8 @@ void vst1<int8_t, int8x8_t>(int8_t* out, int8x8_t vout) {
 
 template <>
 void quantize_tensor_arm64_q8_wrapper<uint8_t>(
-    const float* __restrict__ in,
-    uint8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    uint8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -216,8 +216,8 @@ void quantize_tensor_arm64_q8_wrapper<uint8_t>(
 
 template <>
 void quantize_tensor_arm64_q8_wrapper<int8_t>(
-    const float* __restrict__ in,
-    int8_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    int8_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
8 changes: 4 additions & 4 deletions backends/xnnpack/runtime/utils/utils.h
@@ -82,8 +82,8 @@ void vst1(T* out, Tx8 vout);
 
 template <typename underlying_t, typename underlying_x8_t>
 void quantize_tensor_arm64_q8(
-    const float* __restrict__ in,
-    underlying_t* __restrict__ out,
+    const float* ET_RESTRICT in,
+    underlying_t* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point) {
@@ -117,8 +117,8 @@ void quantize_tensor_arm64_q8(
 
 template <typename T>
 void quantize_tensor_arm64_q8_wrapper(
-    const float* __restrict__ in,
-    T* __restrict__ out,
+    const float* ET_RESTRICT in,
+    T* ET_RESTRICT out,
     const int64_t N,
     const float scale,
     const int32_t zero_point);
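These wrappers (and the templated quantize_tensor_arm64_q8 they specialize) implement standard affine quantization. Assuming the usual mapping q = clamp(round(x / scale) + zero_point, qmin, qmax), a scalar reference would look like the sketch below; the NEON path computes the same thing eight lanes at a time, and this function name is illustrative, not the shipped code.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Scalar reference sketch for the arm64 q8 paths; T is uint8_t or int8_t.
    template <typename T>
    void quantize_tensor_scalar_q8(
        const float* ET_RESTRICT in,
        T* ET_RESTRICT out,
        const int64_t N,
        const float scale,
        const int32_t zero_point) {
      constexpr int32_t qmin = std::numeric_limits<T>::min();
      constexpr int32_t qmax = std::numeric_limits<T>::max();
      for (int64_t i = 0; i < N; ++i) {
        const int32_t q =
            static_cast<int32_t>(std::nearbyint(in[i] / scale)) + zero_point;
        out[i] = static_cast<T>(std::clamp(q, qmin, qmax));
      }
    }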
4 changes: 2 additions & 2 deletions kernels/optimized/cpu/op_log_softmax.cpp
@@ -31,8 +31,8 @@ namespace {
 
 template <typename IN_T, typename OUT_T>
 void log_softmax_kernel(const Tensor& input, int64_t dim, Tensor& out) {
-  const IN_T* __restrict__ input_data_base = input.const_data_ptr<IN_T>();
-  OUT_T* __restrict__ output_data_base = out.mutable_data_ptr<OUT_T>();
+  const IN_T* ET_RESTRICT input_data_base = input.const_data_ptr<IN_T>();
+  OUT_T* ET_RESTRICT output_data_base = out.mutable_data_ptr<OUT_T>();
 
   if (input.dim() == 0) {
     output_data_base[0] = 0;
8 changes: 4 additions & 4 deletions kernels/portable/cpu/op_tril.cpp
@@ -38,8 +38,8 @@ Tensor& clear_out(Tensor& out) {
  */
 template <typename CTYPE>
 void apply_tril(
-    CTYPE* __restrict__ self,
-    CTYPE* __restrict__ out,
+    CTYPE* ET_RESTRICT self,
+    CTYPE* ET_RESTRICT out,
     int64_t diagonal,
     int64_t num_rows,
     int64_t num_cols,
@@ -104,8 +104,8 @@ void tril_kernel(
   int64_t col_stride = strides_ref[ndim - 1];
 
   for (const auto i : c10::irange(batch_size)) {
-    CTYPE* __restrict__ data_self_ptr = &data_self[i * self_stride];
-    CTYPE* __restrict__ data_out_ptr = &data_out[i * self_stride];
+    CTYPE* ET_RESTRICT data_self_ptr = &data_self[i * self_stride];
+    CTYPE* ET_RESTRICT data_out_ptr = &data_out[i * self_stride];
 
     apply_tril<CTYPE>(
         data_self_ptr,
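For context on what apply_tril computes: it keeps the elements of each 2-D slice that lie on or below the (offset) main diagonal and zeroes the rest. A dense, row-major sketch of that semantic under contiguous-storage assumptions (the real function also takes row and column strides; tril_dense is an illustrative name):

    // Dense row-major sketch of the tril semantic.
    template <typename CTYPE>
    void tril_dense(
        const CTYPE* ET_RESTRICT self,
        CTYPE* ET_RESTRICT out,
        int64_t diagonal,
        int64_t num_rows,
        int64_t num_cols) {
      for (int64_t r = 0; r < num_rows; ++r) {
        for (int64_t c = 0; c < num_cols; ++c) {
          // Keep entries with c <= r + diagonal; zero everything above.
          out[r * num_cols + c] =
              (c <= r + diagonal) ? self[r * num_cols + c] : CTYPE(0);
        }
      }
    }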
1 change: 1 addition & 0 deletions kernels/portable/cpu/targets.bzl
@@ -43,6 +43,7 @@ def define_common_targets():
         name = "vec_ops",
         exported_deps = [
             "//executorch/runtime/core/portable_type/c10/c10:c10",
+            "//executorch/runtime/platform:compiler",
         ],
         srcs = [],
         exported_headers = ["vec_ops.h"],
51 changes: 26 additions & 25 deletions kernels/portable/cpu/vec_ops.h
@@ -10,6 +10,7 @@
 #pragma once
 
 #include <c10/util/irange.h>
+#include <executorch/runtime/platform/compiler.h>
 #include <algorithm>
 #include <cmath>
 #include <cstdint>
@@ -45,9 +46,9 @@ inline float vec_maxf(const float* x, size_t size) {
 /// Add each element of `x` and `y` into the corresponding element of `z`. All
 /// arrays must have `size` elements.
 inline void vec_addf(
-    float* __restrict__ z,
-    const float* __restrict__ x,
-    const float* __restrict__ y,
+    float* ET_RESTRICT z,
+    const float* ET_RESTRICT x,
+    const float* ET_RESTRICT y,
     size_t size) {
   for (const auto i : c10::irange(size)) {
     z[i] = x[i] + y[i];
@@ -57,8 +58,8 @@ inline void vec_addf(
 /// Multiplies every element of `x` by `scale`, and writes the result into the
 /// corresponding element of `y`. `x` and `y` must have `size` elements.
 inline void vec_scalef(
-    float* __restrict__ y,
-    const float* __restrict__ x,
+    float* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     size_t size) {
   for (const auto i : c10::irange(size)) {
@@ -70,9 +71,9 @@
 /// z[i][j] = sum(x[i][k] * y[k][j])
 template <typename T, typename U = T>
 inline void vec_matmul(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const U* __restrict__ y,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const U* ET_RESTRICT y,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -89,10 +90,10 @@
 
 template <typename T, typename U = T>
 inline void vec_quantized_matmul_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const U* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const U* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p) {
@@ -115,10 +116,10 @@ static inline size_t bounds_min(size_t a, size_t b) {
 /// z[i][j] = sum(x[i][k] * y[j][k] * s[j][k/g])
 template <typename T, typename U = T, typename V = U>
 inline void vec_quantized_matmul_transb_int8(
-    T* __restrict__ z,
-    const U* __restrict__ x,
-    const int8_t* __restrict__ y,
-    const V* __restrict__ s,
+    T* ET_RESTRICT z,
+    const U* ET_RESTRICT x,
+    const int8_t* ET_RESTRICT y,
+    const V* ET_RESTRICT s,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -146,10 +147,10 @@ inline void vec_quantized_matmul_transb_int8(
 // T for tensor dtype, U for scalar type
 template <typename T, typename U = T>
 inline void vec_addmm(
-    T* __restrict__ out_data,
-    const T* __restrict__ self_data,
-    const T* __restrict__ mat1_data,
-    const T* __restrict__ mat2_data,
+    T* ET_RESTRICT out_data,
+    const T* ET_RESTRICT self_data,
+    const T* ET_RESTRICT mat1_data,
+    const T* ET_RESTRICT mat2_data,
     int64_t m,
     int64_t n,
     int64_t p,
@@ -195,7 +196,7 @@ template <
     typename checkU = typename std::enable_if<
         std::is_same<float, typename std::remove_cv<U>::type>::value ||
         std::is_same<double, typename std::remove_cv<U>::type>::value>::type>
-inline void vec_softmax(T* __restrict__ y, const U* __restrict__ x, int n) {
+inline void vec_softmax(T* ET_RESTRICT y, const U* ET_RESTRICT x, int n) {
   U max_x = *std::max_element(x, x + n);
   T sum = 0;
 
@@ -223,8 +224,8 @@ constexpr const T& clamp(const T& v, const T& lo, const T& hi) {
 /// Quantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `dequantize_i8_f32()`.
 inline void quantize_i8_f32(
-    int8_t* __restrict__ y,
-    const float* __restrict__ x,
+    int8_t* ET_RESTRICT y,
+    const float* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
@@ -237,8 +238,8 @@ inline void quantize_i8_f32(
 /// Dequantizes the elements of `x` into `y`, both of which must have `size`
 /// elements. Inverse of `quantize_i8_f32()`.
 inline void dequantize_i8_f32(
-    float* __restrict__ y,
-    const int8_t* __restrict__ x,
+    float* ET_RESTRICT y,
+    const int8_t* ET_RESTRICT x,
     float scale,
     int32_t zero_point,
     size_t size) {
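One caller-facing note on this file: ET_RESTRICT, like __restrict__ before it, is a promise to the compiler that the pointer arguments do not alias, so passing overlapping buffers to these helpers remains undefined behavior. A sketch with vec_addf (namespace qualification omitted; assume vec_ops.h is included and the function is visible):

    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float y[4] = {5.0f, 6.0f, 7.0f, 8.0f};
    float z[4];
    vec_addf(z, x, y, 4); // fine: z, x, and y are distinct buffers

    // vec_addf(x, x, y, 4); // would break the no-alias promise: UB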
7 changes: 7 additions & 0 deletions runtime/platform/compiler.h
@@ -65,6 +65,13 @@
 #define ET_INLINE __attribute__((always_inline)) inline
 #endif
 
+// Restrict
+#if defined(_MSC_VER)
+#define ET_RESTRICT __restrict
+#else
+#define ET_RESTRICT __restrict__
+#endif
+
 #if defined(__GNUC__)
 
 #define ET_UNREACHABLE() __builtin_unreachable()
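This hunk is the core of the PR: ISO C++ has no standard restrict keyword, and the compiler-specific spellings differ (MSVC accepts __restrict; GCC and Clang accept __restrict__), so ET_RESTRICT gives every call site one portable name. A minimal sketch of the effect, with an illustrative function:

    #include <executorch/runtime/platform/compiler.h>

    // Expands to __restrict under MSVC and __restrict__ elsewhere. Either
    // way, the compiler may assume dst and src never overlap, which lets it
    // vectorize the loop without emitting runtime overlap checks.
    void scale_by_two(
        float* ET_RESTRICT dst, const float* ET_RESTRICT src, int n) {
      for (int i = 0; i < n; ++i) {
        dst[i] = 2.0f * src[i];
      }
    }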