/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "mlu_ops_api.h"

namespace xllm::kernel::mlu {

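// scaled_matmul: thin wrapper over the MLU smooth-quant GEMM kernel.
// It infers the activation/weight quantization layouts from the shapes of
// `a_scale` and `b_scale`, allocates the output tensor when none is supplied,
// and forwards all arguments to tmo::torch_api::scaled_matmul. `b` is taken
// in [N, K] layout (trans_b = true), so the result has shape
// [a.size(0), b.size(0)].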
torch::Tensor scaled_matmul(
    const torch::Tensor& a,
    const torch::Tensor& b,
    const std::optional<torch::Tensor>& a_scale,
    const torch::Tensor& b_scale,
    c10::ScalarType output_dtype,
    const std::optional<torch::Tensor>& bias /* = c10::nullopt */,
    const std::optional<torch::Tensor>& c /* = c10::nullopt */,
    const std::string& act_mode /* = "none" */,
    int64_t quant_bit_size /* = 8 */,
    double alpha /* = 1.0 */,
    double beta /* = 1.0 */,
    bool use_hp_active /* = false */,
    int64_t a_quant_bit_size /* = -1 */,
    const std::optional<torch::Tensor>& a_calib /* = c10::nullopt */,
    const std::optional<torch::Tensor>& b_calib /* = c10::nullopt */,
    const std::optional<torch::Tensor>& output /* = c10::nullopt */
) {
  // Check: only support w8a8 quantization for now.
  TORCH_CHECK(quant_bit_size == 8 && a_quant_bit_size == 8,
              "scaled_matmul only supports w8a8 quantization (quant_bit_size "
              "== 8, a_quant_bit_size == 8) for now. "
              "Got quant_bit_size = ",
              quant_bit_size,
              ", a_quant_bit_size = ",
              a_quant_bit_size,
              ".");

  // Only the smooth_quant algorithm is supported for now, so the activation
  // scale is required.
  TORCH_CHECK(a_scale.has_value(),
              "scaled_matmul requires a_scale for smooth_quant (w8a8).");
  std::string quant_algo = "smooth_quant";

  // Infer the quantization layouts from the scale tensor shapes: a 1-D
  // activation scale means per-token quantization; a 1-D weight scale means
  // per-channel, while a 2-D weight scale with fewer rows than `b` means
  // per-block and otherwise group-wise.
  std::string a_quant_layout = (a_scale.value().dim() == 1)
                                   ? "quantize_per_token"
                                   : "quantize_group_wise";
  std::string b_quant_layout = "quantize_per_channel";
  if (b_scale.dim() > 1) {
    if (b_scale.size(0) < b.size(0)) {
      b_quant_layout = "quantize_per_block";
    } else {
      b_quant_layout = "quantize_group_wise";
    }
  }
  std::optional<torch::Tensor> gemm_output_scale = c10::nullopt;

  at::ScalarType torch_half = at::ScalarType::Half;
  at::ScalarType torch_bfloat16 = at::ScalarType::BFloat16;

  TORCH_CHECK(output_dtype == torch_half || output_dtype == torch_bfloat16,
              "output dtype must be half or bfloat16, but got: ",
              output_dtype,
              ".");

  // Use the caller-provided output tensor if present; otherwise allocate one
  // of shape [M, N].
  torch::Tensor output_tensor;
  if (output.has_value()) {
    output_tensor = output.value();
  } else {
    output_tensor = at::empty(
        {a.size(0), b.size(0)},
        torch::TensorOptions().dtype(output_dtype).device(a.device()));
  }

  // Call underlying kernel for smooth_quant
  tmo::torch_api::scaled_matmul(output_tensor,
                                a,
                                b,
                                a_scale,
                                c10::nullopt,  // a_zero
                                a_calib,
                                b_scale,
                                c10::nullopt,  // b_zero
                                b_calib,
                                bias,
                                c,
                                c10::nullopt,  // c_scale
                                c10::nullopt,  // c_zero
                                gemm_output_scale,
                                c10::nullopt,  // gemm_output_zero
                                quant_algo,
                                a_quant_layout,
                                b_quant_layout,
                                a_quant_bit_size,
                                quant_bit_size,
                                act_mode,
                                use_hp_active,
                                1.0,    // act_coef
                                alpha,
                                beta,
                                false,  // trans_a
                                true    // trans_b
  );
  return output_tensor;
}
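
// Usage sketch (illustrative only; the shapes and tensor names below are
// assumptions derived from this wrapper, not from an xLLM call site):
//
//   a:       [M, K] int8 activations     b:       [N, K] int8 weights
//   a_scale: [M]    per-token scales     b_scale: [N]    per-channel scales
//
//   auto out = scaled_matmul(a, b, a_scale, b_scale, at::ScalarType::BFloat16,
//                            /*bias=*/c10::nullopt, /*c=*/c10::nullopt,
//                            /*act_mode=*/"none", /*quant_bit_size=*/8,
//                            /*alpha=*/1.0, /*beta=*/1.0,
//                            /*use_hp_active=*/false, /*a_quant_bit_size=*/8);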

}  // namespace xllm::kernel::mlu