Commit d3f063d

Add elementwise kernels.
2 parents e36af02 + 3b52c17 commit d3f063d

File tree

6 files changed: +444 -10 lines changed

include/sgl_kernel_ops.h (file mode 100755 → 100644)

Lines changed: 3 additions & 4 deletions
@@ -22,7 +22,6 @@ limitations under the License.
 #include <torch/torch.h>
 
 #include <sycl/sycl.hpp>
-
 #include <tuple>
 #include <vector>
 
@@ -123,9 +122,9 @@ void sgl_fused_add_rmsnorm(
     torch::Tensor input, torch::Tensor residual, torch::Tensor weight, double eps, bool enable_pdl);
 void gemma_rmsnorm(at::Tensor& output, at::Tensor& input, at::Tensor& weight, double eps, bool enable_pdl);
 void gemma_fused_add_rmsnorm(at::Tensor& input, at::Tensor& residual, at::Tensor& weight, double eps, bool enable_pdl);
-void silu_and_mul(at::Tensor& out, at::Tensor& input, int64_t sycl_stream);
-void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input, int64_t sycl_stream);
-void gelu_and_mul(at::Tensor& out, at::Tensor& input, int64_t sycl_stream);
+void silu_and_mul(at::Tensor& out, at::Tensor& input);
+void gelu_tanh_and_mul(at::Tensor& out, at::Tensor& input);
+void gelu_and_mul(at::Tensor& out, at::Tensor& input);
 void apply_rope_pos_ids_cos_sin_cache(
     at::Tensor q,
     at::Tensor k,
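
Note: the trailing int64_t sycl_stream parameter is dropped from all three activation-and-mul declarations; the SYCL implementations added in src/sycl/TripleOps.cpp below obtain the current XPU stream themselves via at::xpu::getCurrentXPUStream(), so callers no longer thread a stream handle through.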

python/sgl_kernel/elementwise.py

Lines changed: 3 additions & 3 deletions
@@ -179,7 +179,7 @@ def silu_and_mul(input: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
             device=input.device,
             dtype=input.dtype,
         )
-    torch.ops.sgl_kernel.silu_and_mul.default(out, input, get_cuda_stream())
+    torch.ops.sgl_kernel.silu_and_mul.default(out, input)
     return out
 
 
@@ -194,7 +194,7 @@ def gelu_tanh_and_mul(input: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
             device=input.device,
             dtype=input.dtype,
         )
-    torch.ops.sgl_kernel.gelu_tanh_and_mul.default(out, input, get_cuda_stream())
+    torch.ops.sgl_kernel.gelu_tanh_and_mul.default(out, input)
     return out
 
 
@@ -209,7 +209,7 @@ def gelu_and_mul(input: torch.Tensor, out: torch.Tensor = None) -> torch.Tensor:
             device=input.device,
             dtype=input.dtype,
         )
-    torch.ops.sgl_kernel.gelu_and_mul.default(out, input, get_cuda_stream())
+    torch.ops.sgl_kernel.gelu_and_mul.default(out, input)
     return out
 
 

src/sycl/TripleOps.cpp (new file)

Lines changed: 246 additions & 0 deletions
@@ -0,0 +1,246 @@
#include <ATen/ATen.h>
#include <ATen/OpMathType.h>
#include <ATen/Parallel.h>
#include <c10/xpu/XPUStream.h>
#include <torch/all.h>

#include <cmath>
#include <cstdint>
#include <iostream>
#include <sycl/sycl.hpp>
#include <vector>

#include "SYCLHelpers.h"
#include "Utils.h"

#define DPCPP_CONSTANT __attribute__((opencl_constant))

#define DPCPP_KER_STRING(var, str) static const DPCPP_CONSTANT char var[] = str;
#define DPCPP_KER_PRINTF sycl::ext::oneapi::experimental::printf

#define DPCPP_K_PRINT(fmt_str, ...)           \
  {                                           \
    DPCPP_KER_STRING(fmt_var, fmt_str);       \
    DPCPP_KER_PRINTF(fmt_var, ##__VA_ARGS__); \
  }

template <typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector_loop {
  scalar_t val[vec_size];

  scalar_t& operator[](int index) {
    return val[index];
  }

  scalar_t const& operator[](int index) const {
    return val[index];
  }
};

template <typename scalar_t, typename accscalar_t>
struct silu_mul_dpcpp_functor {
  scalar_t operator()(scalar_t a, scalar_t b) const {
    return (accscalar_t(a)) / (1.0f + expf(accscalar_t(-a))) * accscalar_t(b);
  }
};

template <typename scalar_t, typename accscalar_t>
struct gelu_tanh_mul_dpcpp_functor {
  scalar_t operator()(scalar_t a, scalar_t b) const {
    const accscalar_t kBeta = M_SQRT2 * M_2_SQRTPI * accscalar_t(0.5);
    const accscalar_t kKappa = 0.044715;
    auto x_cube = accscalar_t(a) * accscalar_t(a) * accscalar_t(a);
    auto inner = kBeta * (accscalar_t(a) + kKappa * x_cube);
    return (accscalar_t(0.5) * accscalar_t(a) * (accscalar_t(1) + std::tanh(accscalar_t(inner)))) * accscalar_t(b);
  }
};

template <typename scalar_t, typename accscalar_t>
struct gelu_erf_mul_dpcpp_functor {
  scalar_t operator()(scalar_t a, scalar_t b) const {
    return (accscalar_t(a) * accscalar_t(0.5) * (accscalar_t(1) + ::erf(accscalar_t(a) * accscalar_t(M_SQRT1_2)))) *
        accscalar_t(b);
  }
};

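For reference, all three functors fuse an activation with an element-wise product. Writing a for an element of the first half of a row and b for its partner in the second half:

    silu_and_mul:      out = a / (1 + exp(-a)) * b
    gelu_tanh_and_mul: out = 0.5 * a * (1 + tanh(sqrt(2/pi) * (a + 0.044715 * a^3))) * b
    gelu_and_mul:      out = 0.5 * a * (1 + erf(a / sqrt(2))) * b

The tanh-GELU constant folds to kBeta = M_SQRT2 * M_2_SQRTPI * 0.5 = sqrt(2) * (2 / sqrt(pi)) / 2 = sqrt(2 / pi).
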
template <typename scalar_t, typename func_t, int N>
struct op_and_mul_functor {
  void operator()(sycl::nd_item<1> item) const {
    using accscalar_t = at::opmath_type<scalar_t>;
    int64_t offset = item.get_local_linear_id();
    int64_t step = item.get_local_range(0);
    int64_t token_id = item.get_group(0);
    func_t fn;
    // Each input row holds 2 * dim elements: the activation half followed by
    // the multiplier half. Work-items stride over the row in N-wide vectors.
    int64_t bound = dim / N;
    for (int64_t i = offset; i < bound; i += step) {
      auto unary_val = reinterpret_cast<aligned_vector_loop<scalar_t, N>*>(input_ptr)[token_id * bound * 2 + i];
      auto mul_val = reinterpret_cast<aligned_vector_loop<scalar_t, N>*>(input_ptr)[token_id * bound * 2 + i + bound];
#pragma unroll
      for (int j = 0; j < N; ++j) {
        unary_val[j] = fn(unary_val[j], mul_val[j]);
      }
      reinterpret_cast<aligned_vector_loop<scalar_t, N>*>(output_ptr)[token_id * bound + i] = unary_val;
    }
  }

  scalar_t* input_ptr;
  scalar_t* output_ptr;
  int64_t num_;
  int64_t dim;
};

#define VEC_LAUNCH(KERNEL, N)                                                \
  case N: {                                                                  \
    op_and_mul_functor<T_to, KERNEL<T_to, accscalar_t>, N> kfn = {           \
        .input_ptr = _input, .output_ptr = _out, .num_ = numel, .dim = dim}; \
    sycl_kernel_submit(num_group * wg_size, wg_size, q, kfn);                \
    break;                                                                   \
  }

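Each VEC_LAUNCH case instantiates op_and_mul_functor at one vector width and hands it to the sycl_kernel_submit helper (from the included SYCLHelpers.h) with a 1-D range of num_group work-groups of wg_size work-items each, i.e. one work-group per token row.
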
template <typename T = float>
void get_config(
    const Tensor& input,
    const Tensor& out,
    int64_t& numel,
    int64_t& dim,
    int64_t& wg_size,
    int64_t& num_group,
    int& vec_size) {
  auto dev_id = dpcppGetDeviceIdOfCurrentQueue();
  int64_t max_wg_size = dpcppMaxWorkGroupSize(dev_id);
  numel = out.numel();
  dim = out.size(-1);
  int64_t tokens = numel / dim;
  wg_size = std::min(dim, max_wg_size);
  num_group = tokens;

  // Start from a 16-byte vector and shrink while a narrower vector would
  // still let one work-group cover the row; fall back to scalar access if
  // dim is not divisible by the chosen width.
  vec_size = sizeof(float) * 4 / sizeof(T);
  while ((vec_size >> 1) * wg_size >= dim) {
    vec_size = vec_size >> 1;
  }
  if (dim % vec_size != 0) vec_size = 1;
}

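A worked example of the heuristic, assuming a device maximum work-group size of 1024: for fp16 with dim = 4096, vec_size starts at sizeof(float) * 4 / sizeof(T) = 8 elements (16 bytes); one halving passes ((8 >> 1) * 1024 = 4096 >= 4096) and the next fails ((4 >> 1) * 1024 = 2048 < 4096), leaving vec_size = 4, so each of the 1024 work-items in a group handles exactly one 4-element vector per half-row.
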
template <typename T_to = float, typename T_from = float>
void silu_and_mul_sycl(sycl::queue& q, Tensor& input, Tensor& out) {
  auto _input = reinterpret_cast<T_to*>(input.data_ptr<T_from>());
  auto _out = reinterpret_cast<T_to*>(out.data_ptr<T_from>());

  int64_t numel;
  int64_t dim;
  int64_t wg_size;
  int64_t num_group;
  int vec_size;
  get_config<T_to>(input, out, numel, dim, wg_size, num_group, vec_size);

  using accscalar_t = at::opmath_type<T_to>;
  switch (vec_size) {
    VEC_LAUNCH(silu_mul_dpcpp_functor, 1);
    VEC_LAUNCH(silu_mul_dpcpp_functor, 2);
    VEC_LAUNCH(silu_mul_dpcpp_functor, 4);
    VEC_LAUNCH(silu_mul_dpcpp_functor, 8);
    VEC_LAUNCH(silu_mul_dpcpp_functor, 16);
    default:
      TORCH_CHECK(false, "Unsupported vector size: ", vec_size);
  }

  return;
}

void silu_and_mul(Tensor& out, Tensor& input) {
  input = input.contiguous();
  out = out.contiguous();

  auto stream = at::xpu::getCurrentXPUStream();
  auto queue = stream.queue();

  if (input.scalar_type() == at::ScalarType::Half) {
    silu_and_mul_sycl<sycl::half, at::Half>(queue, input, out);
  } else {
    silu_and_mul_sycl<sycl::ext::oneapi::bfloat16, at::BFloat16>(queue, input, out);
  }
  return;
}

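The same dtype dispatch repeats in the two entry points below: at::Half maps to sycl::half, and any other dtype falls through to the bf16 instantiation, so in practice these kernels accept fp16 and bf16 only (an fp32 tensor would fail the scalar-type check inside data_ptr<at::BFloat16>()).
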
template <typename T_to = float, typename T_from = float>
void gelu_tanh_and_mul_sycl(sycl::queue& q, Tensor& input, Tensor& out) {
  auto _input = reinterpret_cast<T_to*>(input.data_ptr<T_from>());
  auto _out = reinterpret_cast<T_to*>(out.data_ptr<T_from>());

  int64_t numel;
  int64_t dim;
  int64_t wg_size;
  int64_t num_group;
  int vec_size;
  get_config<T_to>(input, out, numel, dim, wg_size, num_group, vec_size);

  using accscalar_t = at::opmath_type<T_to>;
  switch (vec_size) {
    VEC_LAUNCH(gelu_tanh_mul_dpcpp_functor, 1);
    VEC_LAUNCH(gelu_tanh_mul_dpcpp_functor, 2);
    VEC_LAUNCH(gelu_tanh_mul_dpcpp_functor, 4);
    VEC_LAUNCH(gelu_tanh_mul_dpcpp_functor, 8);
    VEC_LAUNCH(gelu_tanh_mul_dpcpp_functor, 16);
    default:
      TORCH_CHECK(false, "Unsupported vector size: ", vec_size);
  }

  return;
}

void gelu_tanh_and_mul(Tensor& out, Tensor& input) {
  input = input.contiguous();
  out = out.contiguous();

  auto stream = at::xpu::getCurrentXPUStream();
  auto queue = stream.queue();

  if (input.scalar_type() == at::ScalarType::Half) {
    gelu_tanh_and_mul_sycl<sycl::half, at::Half>(queue, input, out);
  } else {
    gelu_tanh_and_mul_sycl<sycl::ext::oneapi::bfloat16, at::BFloat16>(queue, input, out);
  }
  return;
}

template <typename T_to = float, typename T_from = float>
void gelu_and_mul_sycl(sycl::queue& q, Tensor& input, Tensor& out) {
  auto _input = reinterpret_cast<T_to*>(input.data_ptr<T_from>());
  auto _out = reinterpret_cast<T_to*>(out.data_ptr<T_from>());

  int64_t numel;
  int64_t dim;
  int64_t wg_size;
  int64_t num_group;
  int vec_size;
  get_config<T_to>(input, out, numel, dim, wg_size, num_group, vec_size);

  using accscalar_t = at::opmath_type<T_to>;
  switch (vec_size) {
    VEC_LAUNCH(gelu_erf_mul_dpcpp_functor, 1);
    VEC_LAUNCH(gelu_erf_mul_dpcpp_functor, 2);
    VEC_LAUNCH(gelu_erf_mul_dpcpp_functor, 4);
    VEC_LAUNCH(gelu_erf_mul_dpcpp_functor, 8);
    VEC_LAUNCH(gelu_erf_mul_dpcpp_functor, 16);
    default:
      TORCH_CHECK(false, "Unsupported vector size: ", vec_size);
  }

  return;
}

void gelu_and_mul(Tensor& out, Tensor& input) {
  input = input.contiguous();
  out = out.contiguous();

  auto stream = at::xpu::getCurrentXPUStream();
  auto queue = stream.queue();

  if (input.scalar_type() == at::ScalarType::Half) {
    gelu_and_mul_sycl<sycl::half, at::Half>(queue, input, out);
  } else {
    gelu_and_mul_sycl<sycl::ext::oneapi::bfloat16, at::BFloat16>(queue, input, out);
  }
  return;
}
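
For orientation, a minimal host-side sketch of the calling contract, not part of the commit: it assumes the declarations from include/sgl_kernel_ops.h are in scope, the extension is linked, and an XPU device with fp16 support is available; the sizes are made up.

#include <ATen/ATen.h>
#include "sgl_kernel_ops.h"  // assumed include path for silu_and_mul(out, input)

int main() {
  const int64_t tokens = 8, d = 4096;
  // The input packs [activation | multiplier] along the last dim, so it is 2 * d wide.
  at::Tensor input = at::randn({tokens, 2 * d}, at::device(at::kXPU).dtype(at::kHalf));
  at::Tensor out = at::empty({tokens, d}, input.options());
  // Computes out[t, j] = silu(input[t, j]) * input[t, d + j] on the current XPU stream.
  silu_and_mul(out, input);
  return 0;
}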
