Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/ATen/native/xpu/sycl/BatchNormKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ template <
typename index_t>
struct BatchNormCollectStatisticsKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
int plane = item.get_group(1);
int tid = item.get_local_linear_id();
Expand Down Expand Up @@ -1874,7 +1874,7 @@ template <
typename index_t>
struct BatchNormBackwardReduceKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
index_t plane = item.get_group(1);

Expand Down Expand Up @@ -4162,7 +4162,7 @@ template <
typename stat_accscalar_t,
typename index_t>
struct BatchNormBackwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
index_t plane = item.get_group(1);
index_t N = grad_output_.size(0) * grad_output_.size(2);
Expand Down Expand Up @@ -4370,7 +4370,7 @@ template <
typename index_t>
struct BatchNormBackwardVectorizedKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
index_t plane = item.get_group(1);
index_t N = grad_output_.size(0) * grad_output_.size(2);
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/Dequant_int4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ struct DequantInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
weight_dequant(weight_dequant) {}

void sycl_ker_config_convention(sycl::handler& cgh) {}
[[intel::reqd_sub_group_size(SgSize)]] void operator()(
[[sycl::reqd_sub_group_size(SgSize)]] void operator()(
sycl::nd_item<1> it) const {
int constexpr GroupN = TileN;
int constexpr GroupK = SgSize * TileK;
Expand Down
10 changes: 4 additions & 6 deletions src/ATen/native/xpu/sycl/ForeachReduceKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,7 @@ template <
int r_args_depth = 1,
int res_arg_index = 0>
struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
template <typename TLA, typename TLW>
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
template <typename TLA, typename TLW> void operator()(
const int64_t chunk_size,
TLA tlAddress,
TLW tlWGMeta,
Expand Down Expand Up @@ -117,7 +116,7 @@ struct LpNormFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {

template <typename out_t, NormType norm_type, typename opmath_t, int SIMD>
struct lpnormChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item_id) const {
auto lid = item_id.get_local_linear_id();
auto group_id = item_id.get_group(0);
Expand Down Expand Up @@ -481,8 +480,7 @@ std::vector<Tensor> foreach_norm_kernel(

template <typename T, int SIMD>
struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
template <typename TLA, typename TLW>
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
template <typename TLA, typename TLW> void operator()(
int64_t chunk_size,
TLA tlAddressMeta,
TLW tlWGMeta,
Expand Down Expand Up @@ -555,7 +553,7 @@ struct LpMaxFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {

template <typename T, int SIMD>
struct LpmaxChunkReduceKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item_id) const {
auto local_range = item_id.get_local_range(0);
auto lid = item_id.get_local_linear_id();
Expand Down
16 changes: 8 additions & 8 deletions src/ATen/native/xpu/sycl/GroupNormKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ struct GNRowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using WelfordOp =
WelfordOpsXPU<T_ACC, T_ACC, int64_t, std::pair<T_ACC, T_ACC>>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
const int64_t i = item.get_group(0);
WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item};
Expand Down Expand Up @@ -114,7 +114,7 @@ struct GNRowwiseMomentsVectorizedFunctor
WelfordOpsXPU<T_ACC, T_ACC, int64_t, std::pair<T_ACC, T_ACC>>;
using vec_t = memory::aligned_vector<T, VEC_SIZE>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
WelfordType val[VEC_SIZE];
WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false, item};
Expand Down Expand Up @@ -476,7 +476,7 @@ void group_norm_kernel(
template <typename T, typename T_ACC, int SIMD>
struct Compute1dBackwardFusedParamsFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
const int64_t G = group_;
const int64_t D = C_ / G;
Expand Down Expand Up @@ -630,7 +630,7 @@ template <typename T, int SIMD, int kReduceTileSize>
struct GammaBeta1dBackwardLargeKernel : public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
const int64_t c =
item.get_group(1) * item.get_local_range(1) + item.get_local_id(1);
Expand Down Expand Up @@ -890,7 +890,7 @@ template <typename T, int SIMD>
struct ComputeInternalGradientsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
const int64_t nc = item.get_group(0);
T_ACC sum1 = 0;
Expand Down Expand Up @@ -941,7 +941,7 @@ struct ComputeInternalGradientsVectorizedFunctor
using vec_t = memory::aligned_vector<T, VEC_SIZE>;
using acc_vec_t = memory::aligned_vector<T_ACC, VEC_SIZE>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
acc_vec_t sum1_vec;
acc_vec_t sum2_vec;
Expand Down Expand Up @@ -1038,7 +1038,7 @@ struct ComputeBackwardFusedParamsFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
const int64_t G = group_;
const int64_t D = C_ / G;
Expand Down Expand Up @@ -1176,7 +1176,7 @@ template <typename T, int SIMD, int kReduceTileSize>
struct GammaBetaBackwardFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using T_ACC = acc_type_device<T, kXPU>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
auto group_x = item.get_group(1);
auto group_size_x = item.get_local_range(1);
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/IndexKernelUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ inline bool fast_gather_kernel_eligible(

template <int Alignment, typename index_t>
struct VectorizedGatherKernel {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item) const {
int64_t ind = idx_[item.get_group(1)];
if (allow_neg_indices_) {
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/LayerNormKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ struct RowwiseMomentsFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
using WelfordType = WelfordData<T_ACC, int64_t>;
using WelfordOp = WelfordOps<T_ACC, T_ACC, int64_t, std::pair<T_ACC, T_ACC>>;

[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item_id) const {
const int64_t i = item_id.get_group(0);
WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false};
Expand Down Expand Up @@ -435,7 +435,7 @@ WelfordDataLN compute_stats(
template <typename T, typename T_ACC>
struct VectorizedLayerNormKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<2> item_id) const {
auto i1 = item_id.get_group(1);
const T* block_row = X_ + i1 * N_;
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/LinearInt4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ struct LinearInt4KernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
ldc(ldc) {}
void sycl_ker_config_convention(sycl::handler& cgh) {}

[[intel::reqd_sub_group_size(16)]] void operator()(
[[sycl::reqd_sub_group_size(16)]] void operator()(
sycl::nd_item<1> it) const {
int constexpr Unroll = 2;
int constexpr SgSize = 16;
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/LossNLL2dKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ struct NllLoss2dForwardNoReduceKernelFunctor {

template <typename scalar_t, typename accscalar_t, typename index_t, int SIMD>
struct NllLoss2dForwardKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
scalar_t cur_weight;
accscalar_t input_sum = 0;
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/MultiLabelMarginLossKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ void multilabel_margin_loss_shape_check(
template <typename scalar_t, typename accscalar_t>
struct MultilabelMarginLossForwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
[[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
operator()(sycl::nd_item<1> item) const {
int k = item.get_group(0);
const scalar_t* input_k = input_ + k * dim_;
Expand Down Expand Up @@ -148,7 +148,7 @@ struct MultilabelMarginLossForwardKernelFunctor
template <typename scalar_t, typename accscalar_t>
struct MultilabelMarginLossBackwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
[[sycl::reqd_sub_group_size(MULTILABELMARGIN_SUB_GROUP_SIZE)]] void
operator()(sycl::nd_item<1> item) const {
int k = item.get_group(0);
const scalar_t* input_k = input_ + k * dim_;
Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/Norm.h
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ template <
class Norm,
bool one_moment = false>
struct FusedNormKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<3> item_id) const {
accscalar_t sum1 = 0;
accscalar_t sum2 = 0;
Expand Down Expand Up @@ -747,7 +747,7 @@ template <
class Norm,
bool one_moment = false>
struct RowwiseMomentsKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<3> item_id) const {
index_t local_id = item_id.get_local_id(2);

Expand Down
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/SoftMaxKernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ template <
bool is_same_dtype>
struct DispatchSoftmaxForwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
if (local_size_ == 1 && item.get_global_id(0) >= outer_size_)
return;
Expand Down Expand Up @@ -933,7 +933,7 @@ template <
bool is_same_dtype = false>
struct DispatchSoftmaxBackwardKernelFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(SIMD)]] void operator()(
[[sycl::reqd_sub_group_size(SIMD)]] void operator()(
sycl::nd_item<1> item) const {
if (local_size_ == 1 && item.get_global_id(0) >= outer_size_)
return;
Expand Down
10 changes: 5 additions & 5 deletions src/ATen/native/xpu/sycl/SortingKernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace xpu {
template <typename method_t, typename key_t, typename value_t>
struct SegmentedGroupRadixSortPairsFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int seg_idx = item.get_group(0);
int seg_offset = seg_idx * num_elements_;
Expand Down Expand Up @@ -96,7 +96,7 @@ void segmented_group_radix_sort_pairs_kernel(
template <typename method_t, typename key_t, typename value_t>
struct SegmentedRadixSortPairsUpsweepFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) /
method_t::PROCESSING_LENGTH;
Expand Down Expand Up @@ -179,7 +179,7 @@ void segmented_radix_sort_pairs_upsweep_kernel(
template <typename method_t>
struct SegmentedRadixSortPairsScanFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
constexpr int RADIX_BUCKETS = 16;
int seg_idx = item.get_group(0);
Expand Down Expand Up @@ -218,7 +218,7 @@ void segmented_radix_sort_pairs_scan_kernel(
template <typename method_t, typename key_t, typename value_t>
struct SegmentedRadixSortPairsDownsweepFunctor
: public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int num_tiles = (num_elements_ + method_t::PROCESSING_LENGTH - 1) /
method_t::PROCESSING_LENGTH;
Expand Down Expand Up @@ -448,7 +448,7 @@ struct SegmentedGroupRadixSelectPairsFunctor
MAX_KV_BYTES = std::max(sizeof(key_t), sizeof(value_t)),
};

[[intel::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
[[sycl::reqd_sub_group_size(method_t::SUBGROUP_SIZE)]] void operator()(
sycl::nd_item<1> item) const {
int seg_idx = item.get_group(0);
int seg_offset = seg_idx * nelements_;
Expand Down
2 changes: 1 addition & 1 deletion src/ATen/native/xpu/sycl/TensorModeKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ inline T reduceGroupWithNThreadLocalReductions(

template <typename T, unsigned int Power2Size>
struct ComputeModeKernelFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
[[intel::reqd_sub_group_size(32)]] void operator()(
[[sycl::reqd_sub_group_size(32)]] void operator()(
sycl::nd_item<3> item) const {
int tidx = item.get_local_id(2);
int stidx = item.get_local_range(2) +
Expand Down
Loading