Merged
2 changes: 1 addition & 1 deletion kt-kernel/CMakeLists.txt
@@ -495,7 +495,7 @@ if(NOT DEFINED CLANG_FORMAT_BIN)
   )
 endif()
 if(NOT CLANG_FORMAT_BIN)
-  message(WARNING "clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.")
+  message(WARNING "ONLY for developer: clang-format not found. Please install clang-format (>=18) or pass -DCLANG_FORMAT_BIN=/full/path and reconfigure.")
 else()
   execute_process(
     COMMAND ${CLANG_FORMAT_BIN} --version
14 changes: 14 additions & 0 deletions kt-kernel/CMakePresets.json
@@ -39,6 +39,20 @@
         "KTRANSFORMERS_CPU_USE_AMX_AVX512": "ON",
         "KTRANSFORMERS_USE_CUDA": "ON"
       }
     },
+    {
+      "name": "amd",
+      "displayName": "amd_platform",
+      "description": "for amd platform",
+      "cacheVariables": {
+        "KTRANSFORMERS_CPU_USE_AMX": "OFF",
+        "LLAMA_AVX512": "OFF",
+        "LLAMA_AVX2": "ON",
+        "KTRANSFORMERS_CPU_USE_AMX_AVX512": "OFF",
+        "KTRANSFORMERS_USE_CUDA": "ON",
+        "KTRANSFORMERS_CPU_MOE_AMD": "ON",
+        "KTRANSFORMERS_CPU_MOE_KERNEL": "ON"
+      }
+    }

   ]
53 changes: 22 additions & 31 deletions kt-kernel/README.md
@@ -2,41 +2,32 @@

High-performance kernel operations for KTransformers, featuring CPU-optimized MoE inference with AMX, AVX, KML and blis (amd library) support.

-- [KT-Kernel](#kt-kernel)
-- [Note](#note)
-- [Features](#features)
-- [Installation](#installation)
-- [Prerequisites](#prerequisites)
-- [Quick Installation (Recommended)](#quick-installation-recommended)
-- [Manual Configuration (Advanced)](#manual-configuration-advanced)
-- [Verification](#verification)
-- [Integration with SGLang](#integration-with-sglang)
-- [Installation Steps](#installation-steps)
-- [1. Install SGLang](#1-install-sglang)
-- [2. Prepare Weights](#2-prepare-weights)
-- [3. Launch SGLang Server](#3-launch-sglang-server)
-- [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b)
-- [Option A: AMX Backend (AMXINT8)](#option-a-amx-backend-amxint8)
-- [Option B: LLAMAFILE Backend (GGUF)](#option-b-llamafile-backend-gguf)
-- [KT-Kernel Parameters](#kt-kernel-parameters)
-- [Direct Python API Usage](#direct-python-api-usage)
-- [Advanced Options](#advanced-options)
-- [Build Configuration](#build-configuration)
-- [Manual Installation](#manual-installation)
-- [1. Install System Dependencies](#1-install-system-dependencies)
-- [2. Set Build Configuration](#2-set-build-configuration)
-- [3. Build and Install](#3-build-and-install)
-- [Error Troubleshooting](#error-troubleshooting)
-- [CUDA Not Found](#cuda-not-found)
-- [hwloc Not Found](#hwloc-not-found)
-- [Weight Quantization](#weight-quantization)
-- [Before Commit!](#before-commit)
+- [Note](#note)
+- [Features](#features)
+- [Installation](#installation)
+- [Prerequisites](#prerequisites)
+- [Quick Installation (Recommended)](#quick-installation-recommended)
+- [Manual Configuration (Advanced)](#manual-configuration-advanced)
+- [Verification](#verification)
+- [Integration with SGLang](#integration-with-sglang)
+- [Installation Steps](#installation-steps)
+- [Complete Example: Qwen3-30B-A3B](#complete-example-qwen3-30b-a3b)
+- [KT-Kernel Parameters](#kt-kernel-parameters)
+- [Direct Python API Usage](#direct-python-api-usage)
+- [Advanced Options](#advanced-options)
+- [Build Configuration](#build-configuration)
+- [Manual Installation](#manual-installation)
+- [Error Troubleshooting](#error-troubleshooting)
+- [CUDA Not Found](#cuda-not-found)
+- [hwloc Not Found](#hwloc-not-found)
+- [Weight Quantization](#weight-quantization)
+- [Before Commit!](#before-commit)
## Note

**Current Support Status:**
- ✅ **Intel CPUs with AMX**: Fully supported (using weights converted to INT4/INT8 format)
- ✅ **Universal CPU (llamafile backend)**: Supported (using GGUF-format weights)
-- ⚠️ **AMD CPUs with BLIS**: In progress, not yet fully integrated
+- **AMD CPUs with BLIS**: Supported (for int8 prefill & decode)

## Features

@@ -145,7 +136,7 @@ python scripts/convert_cpu_weights.py \
--input-path /path/to/model \
--input-type bf16 \
--output /path/to/cpu-weights \
---quant-method int8 # or int4
+--quant-method int8 # or int4 or moe_int8 (for amd now)
```

- `--input-path`: Path to GPU-side original weights
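The README change above adds `moe_int8` to the accepted `--quant-method` values. As a hedged illustration only — this is not the conversion script's actual implementation, and the helper name is hypothetical — validating the method string before launching a conversion might look like:

```cpp
#include <array>
#include <cassert>
#include <string>

// Hypothetical validator for the --quant-method flag shown above. The accepted
// set mirrors the comment in the command: int8, int4, and the new moe_int8
// (the AMD MoE path added by this PR).
bool is_supported_quant_method(const std::string& m) {
  static const std::array<std::string, 3> supported = {"int8", "int4", "moe_int8"};
  for (const auto& s : supported) {
    if (s == m) return true;  // exact match against a known method
  }
  return false;
}
```

A caller would reject unknown strings early, e.g. `is_supported_quant_method("moe_int8")` accepts the new AMD method while `is_supported_quant_method("fp8")` is refused.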
53 changes: 22 additions & 31 deletions kt-kernel/README_zh.md
@@ -2,42 +2,33 @@

高性能 KTransformers 内核库,提供面向 CPU 的高效 MoE 推理内核,支持 AMX 和 AVX 等后端。

-- [KT-Kernel](#kt-kernel)
-- [说明](#说明)
-- [特性](#特性)
-- [安装](#安装)
-- [先决条件](#先决条件)
-- [快速安装(推荐)](#快速安装推荐)
-- [手动配置(进阶)](#手动配置进阶)
-- [验证安装](#验证安装)
-- [与 SGLang 集成](#与-sglang-集成)
-- [安装步骤](#安装步骤)
-- [1. 安装 SGLang](#1-安装-sglang)
-- [2. 准备权重](#2-准备权重)
-- [3. 启动 SGLang Server](#3-启动-sglang-server)
-- [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b)
-- [方案 A:AMX 后端(AMXINT8)](#方案-aamx-后端amxint8)
-- [方案 B:LLAMAFILE 后端(GGUF)](#方案-bllamafile-后端gguf)
-- [KT-Kernel 参数](#kt-kernel-参数)
-- [直接使用 Python API](#直接使用-python-api)
-- [高级选项](#高级选项)
-- [构建配置](#构建配置)
-- [手动安装](#手动安装)
-- [1. 安装系统依赖](#1-安装系统依赖)
-- [2. 配置构建参数](#2-配置构建参数)
-- [3. 构建并安装](#3-构建并安装)
-- [错误排查](#错误排查)
-- [找不到 CUDA](#找不到-cuda)
-- [找不到 hwloc](#找不到-hwloc)
-- [权重量化](#权重量化)
-- [提交前必读](#提交前必读)
+- [说明](#说明)
+- [特性](#特性)
+- [安装](#安装)
+- [先决条件](#先决条件)
+- [快速安装(推荐)](#快速安装推荐)
+- [手动配置(进阶)](#手动配置进阶)
+- [验证安装](#验证安装)
+- [与 SGLang 集成](#与-sglang-集成)
+- [安装步骤](#安装步骤)
+- [完整示例:Qwen3-30B-A3B](#完整示例qwen3-30b-a3b)
+- [KT-Kernel 参数](#kt-kernel-参数)
+- [直接使用 Python API](#直接使用-python-api)
+- [高级选项](#高级选项)
+- [构建配置](#构建配置)
+- [手动安装](#手动安装)
+- [错误排查](#错误排查)
+- [找不到 CUDA](#找不到-cuda)
+- [找不到 hwloc](#找不到-hwloc)
+- [权重量化](#权重量化)
+- [提交前必读](#提交前必读)

## 说明

**当前支持状态:**
- ✅ **带 AMX 的 Intel CPU**:已支持(基于转换为 INT4/INT8 格式的权重)
- ✅ **通用 CPU(llamafile 后端)**:已支持(基于 GGUF 格式的权重)
-- ⚠️ **带 BLIS 的 AMD CPU**:进行中,尚未完全集成
+- **带 BLIS 的 AMD CPU**:已支持(int8 的 prefill 和 decode)

## 特性

@@ -149,7 +140,7 @@ python scripts/convert_cpu_weights.py \
--input-path /path/to/model \
--input-type bf16 \
--output /path/to/cpu-weights \
---quant-method int8 # 或 int4
+--quant-method int8 # 或 int4 或 moe_int8(用于 amd 的)
```

- `--input-path`:GPU 侧原始权重路径
4 changes: 1 addition & 3 deletions kt-kernel/operators/amx/test/mmq-test.cpp
@@ -2376,9 +2376,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
static thread_local bool is_first_time = true;
if (is_first_time) {
#pragma omp single
-    {
-      ggml_amx_init();
-    }
+    { ggml_amx_init(); }

// load tile config
ggml_tile_config_init();
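The hunk above only collapses the `#pragma omp single` block onto one line (a formatting change); the once-only initialization semantics are unchanged: every thread in the team reaches the guard, but `ggml_amx_init()` runs once. As a rough standard-C++ analogue of that pattern — using `std::call_once` instead of OpenMP, and a hypothetical stand-in for `ggml_amx_init()` — the idea is:

```cpp
#include <atomic>
#include <cassert>
#include <mutex>
#include <thread>
#include <vector>

// Hypothetical stand-in for ggml_amx_init(): counts how often init really ran.
static std::atomic<int> init_calls{0};
static void fake_amx_init() { init_calls.fetch_add(1); }

static std::once_flag amx_once;

// Analogue of the diff's thread_local first-time flag plus `#pragma omp single`:
// many threads may call this, but the init body executes exactly once.
void ensure_amx_init() { std::call_once(amx_once, fake_amx_init); }

// Drive the init from several threads and report how often it actually ran.
int init_count_after(int n_threads) {
  std::vector<std::thread> workers;
  for (int i = 0; i < n_threads; ++i) workers.emplace_back(ensure_amx_init);
  for (auto& t : workers) t.join();
  return init_calls.load();
}
```

This is a sketch of the once-only idiom, not the kernel's actual code: the real implementation relies on the OpenMP team already executing the enclosing region, so `#pragma omp single` is the natural (and cheaper) choice there.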
4 changes: 1 addition & 3 deletions kt-kernel/operators/amx/test/mmq.cpp
@@ -2372,9 +2372,7 @@ bool ggml_compute_forward_mul_mat_use_amx(struct ggml_tensor* dst) {
static thread_local bool is_first_time = true;
if (is_first_time) {
#pragma omp single
-    {
-      ggml_amx_init();
-    }
+    { ggml_amx_init(); }

// load tile config
ggml_tile_config_init();
41 changes: 21 additions & 20 deletions kt-kernel/operators/llamafile/mla.hpp
@@ -14,15 +14,15 @@
// #include <utility>
// #include <vector>

-// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \
-// do { \
-// if ((what) < (threshold)) { \
-// for (int i = 0; i < (var); i++) { \
-// (fn)(i); \
-// } \
-// } else { \
-// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \
-// } \
+// #define DIRECT_OR_POOL_BY(what, threshold, var, fn) \
+// do { \
+// if ((what) < (threshold)) { \
+// for (int i = 0; i < (var); i++) { \
+// (fn)(i); \
+// } \
+// } else { \
+// pool->do_work_stealing_job((var), nullptr, (fn), nullptr); \
+// } \
// } while (0)

// #define VEC_DOT_TYPE(type) (ggml_internal_get_type_traits((ggml_type)(type)).vec_dot_type)
@@ -31,19 +31,20 @@
// #define QUANT_OFFSET(ptr, type, n, n_elements) \
// (offset_pointer((ptr), (size_t)(n) * QUANT_BLCK_SIZE((n_elements), (type))))

-// #define LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \
-// do { \
-// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \
-// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \
-// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) * sizeof(float)), \
-// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \
-// GGML_PREC_DEFAULT); \
+// #define LLAMAFILE_SGEMM_QUANT_FULL_MATMUL(m, n, k, a, a_type, b, b_col, c, c_col) \
+// do { \
+// llamafile_sgemm((m), (n), QUANT_BLCK_COUNT((k), (a_type)), (a), QUANT_BLCK_COUNT((k), (a_type)), \
+// QUANT_OFFSET((b), VEC_DOT_TYPE((a_type)), (b_col), (k)), \
+// QUANT_BLCK_COUNT((k), VEC_DOT_TYPE((a_type))), offset_pointer((c), (c_col) * (m) *
+// sizeof(float)), \
+// (k), 0, 1, GGML_TASK_TYPE_COMPUTE, (a_type), VEC_DOT_TYPE((a_type)), GGML_TYPE_F32, \
+// GGML_PREC_DEFAULT); \
// } while (0)

-// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \
-// do { \
-// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \
-// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \
+// #define LLAMAFILE_SGEMM_MATMUL_F32(m, n, k, a, lda, b, ldb, c, ldc) \
+// do { \
+// llamafile_sgemm((m), (n), (k), (a), (lda), (b), (ldb), (c), (ldc), 0, 1, GGML_TASK_TYPE_COMPUTE, GGML_TYPE_F32, \
+// GGML_TYPE_F32, GGML_TYPE_F32, GGML_PREC_DEFAULT); \
// } while (0)

// // bool decide_absorb(size_t a,int a_type,size_t b,int b_type,size_t c,int c_type,size_t d,int d_type){
4 changes: 2 additions & 2 deletions kt-kernel/operators/moe_kernel/la/kernel.hpp
@@ -340,7 +340,7 @@ struct GemmKernelInt8 {
static inline const int PACK_SIZE_M = 8;
static inline const int PACK_SIZE_K = 32;

-  static std::string name() { return "INT8"; }
+  static std::string name() { return "MOE_INT8"; }
static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }
// type_: d for decode, p for prefill
static int recommended_nth_down(int n, char type_ = 'd') {
@@ -833,7 +833,7 @@ struct GemmKernelInt4 {
static inline const int PACK_SIZE_K = 32;
static inline const int PACK_SIZE_M = 8;

-  static std::string name() { return "INT4"; }
+  static std::string name() { return "MOE_INT4"; }
static int recommended_nth(int n) { return (n + N_BLOCK - 1) / N_BLOCK; }

static int recommended_nth_down(int n, char type_ = 'd') {
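The `kernel.hpp` hunks rename the kernel identifiers from `INT8`/`INT4` to `MOE_INT8`/`MOE_INT4`, distinguishing the MoE-kernel GEMM paths from other quantized kernels. A minimal sketch — the structs below are hypothetical mirrors of the traits in `kernel.hpp`, reproducing only the static `name()` relevant to the rename — of how such a name might tag logs or converted weight files:

```cpp
#include <cassert>
#include <string>

// Hypothetical minimal mirrors of GemmKernelInt8 / GemmKernelInt4: only the
// static name() changed by this PR is reproduced here.
struct GemmKernelInt8Sketch {
  static std::string name() { return "MOE_INT8"; }
};
struct GemmKernelInt4Sketch {
  static std::string name() { return "MOE_INT4"; }
};

// Example use: tag a layer's converted weights (or a log line) with the
// kernel that produced them, so MoE INT8 output is distinguishable from a
// plain INT8 kernel's output.
template <typename Kernel>
std::string weight_tag(const std::string& layer) {
  return layer + "." + Kernel::name();
}
```

For example, `weight_tag<GemmKernelInt8Sketch>("ffn")` yields `"ffn.MOE_INT8"`; under the old names the tag would have collided with non-MoE INT8 kernels.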