MoE quantization: add int8 and fp8 support (#702)

Chi-Chu319 · web-flow · commit 845d46eb9d46 · 2025-02-24T16:30:50.000+02:00
* MoE quantization: add int8 and fp8 support

* code formatting

* dot dtype

* support only FP8 FNUZ types

* fix tensor quantization to quantize to the correct types
diff --git a/python/perf-kernels/fused_moe/configs/device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/python/perf-kernels/fused_moe/configs/device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json
@@ -0,0 +1,35 @@
+{
+  "small_M": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 4,
+    "num_warps": 8,
+    "num_stages": 2,
+    "waves_per_eu": 0,
+    "matrix_instr_nonkdim": 16,
+    "kpack": 2
+  },
+  "medium_M": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 2,
+    "waves_per_eu": 0,
+    "matrix_instr_nonkdim": 16,
+    "kpack": 2
+  },
+  "large_M": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 2,
+    "waves_per_eu": 0,
+    "matrix_instr_nonkdim": 16,
+    "kpack": 2
+  }
+}
diff --git a/python/perf-kernels/fused_moe/configs/device_name=AMD_Instinct_MI300X,dtype=int8_w8a16.json b/python/perf-kernels/fused_moe/configs/device_name=AMD_Instinct_MI300X,dtype=int8_w8a16.json
@@ -0,0 +1,35 @@
+{
+  "small_M": {
+    "BLOCK_SIZE_M": 32,
+    "BLOCK_SIZE_N": 64,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 2,
+    "waves_per_eu": 0,
+    "matrix_instr_nonkdim": 16,
+    "kpack": 2
+  },
+  "medium_M": {
+    "BLOCK_SIZE_M": 64,
+    "BLOCK_SIZE_N": 128,
+    "BLOCK_SIZE_K": 64,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 2,
+    "waves_per_eu": 0,
+    "matrix_instr_nonkdim": 16,
+    "kpack": 2
+  },
+  "large_M": {
+    "BLOCK_SIZE_M": 128,
+    "BLOCK_SIZE_N": 256,
+    "BLOCK_SIZE_K": 128,
+    "GROUP_SIZE_M": 1,
+    "num_warps": 8,
+    "num_stages": 2,
+    "waves_per_eu": 0,
+    "matrix_instr_nonkdim": 16,
+    "kpack": 2
+  }
+}
diff --git a/python/perf-kernels/fused_moe/moe-gemm.py b/python/perf-kernels/fused_moe/moe-gemm.py