xlite-dev
diff --git a/README.md
Lines changed: 1 addition & 1 deletion b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/kernels/layer-norm/layer_norm.py
Lines changed: 1 addition & 1 deletion b/kernels/layer-norm/layer_norm.py
Lines changed: 1 addition & 1 deletion
diff --git a/kernels/ws-hgemm/README.md
Lines changed: 23 additions & 0 deletions b/kernels/ws-hgemm/README.md
Lines changed: 23 additions & 0 deletions
@@ -39,7 +39,7 @@
 <div id="news"></div>
 
 - [2025-07-13] **[🤗flux-faster](https://github.com/xlite-dev/flux-faster)** is released! A forked version of [huggingface/flux-fast](https://github.com/huggingface/flux-fast) that **makes flux-fast even faster** with **[cache-dit](https://github.com/vipshop/cache-dit)**, achieving a **3.3x** speedup on NVIDIA L20 while still maintaining **high precision**.
-  
+
 - [2025-06-16]: [🤗CacheDiT](https://github.com/vipshop/cache-dit) is released! A **Training-free** and **Easy-to-use** Cache Acceleration Toolbox for Diffusion Transformers (**DBCache**, **DBPrune**, **TaylorSeer**, **FBCache**, etc.)🔥. Feel free to give it a try!
 
 <div align='center'>
 
@@ -98,7 +98,7 @@ def run_benchmark(
 
 print(" " * 40 + f"f16 overflow without f32")
 print("-" * 85)
-x_f16 = x.half() * 100 # this will cause overflow for kernels without `f32`
+x_f16 = x.half() * 100  # this will cause overflow for kernels without `f32`
 run_benchmark(lib.layer_norm_f16_f16, x_f16, "f16f16", out_f16)
 run_benchmark(lib.layer_norm_f16_f32, x_f16, "f16f32", out_f16)
 run_benchmark(lib.layer_norm_f16x2_f16, x_f16, "f16x2f16", out_f16)
 
@@ -0,0 +1,23 @@
+# Warp Specialization HGemm
+
+## 0x00 说明
+
+包含以下内容：
+
+- [X] ws_hgemm_naive_cute_kernel
+
+
+## 测试
+
+```bash
+python3 ws_hgemm.py
+```
+
+输出：
+
+```bash
+--------------------------------------------------------------------------------
+out_ws_hgemm_naive_cute: [4096.0, 4096.0, 4096.0], time:3.71974587ms
+out_f16_th: [4096.0, 4096.0, 4096.0], time:5.05561471ms
+--------------------------------------------------------------------------------
+```