@@ -29,6 +29,36 @@ static __device__ void padding(
     *y = Ta(rms * x * w);
 }
 
+// Padding variant for 3-D tensors: one thread per element, one block per
+// (batch, seq) row, with blockDim.x equal to the normalized dimension.
+template <unsigned int BLOCK_SIZE, class Tw, class Ta>
+static __device__ void padding_3d(
+    Ta *__restrict__ y_,
+    int const stride_y_batch,
+    int const stride_y_seq,
+    Ta const *__restrict__ x_,
+    int const stride_x_batch,
+    int const stride_x_seq,
+    Tw const *__restrict__ w_,
+    float const epsilon) {
+
+    // blockIdx.x = batch index, blockIdx.y = seq index
+    auto y = y_ + blockIdx.x * stride_y_batch + blockIdx.y * stride_y_seq + threadIdx.x;
+    float const x = x_[blockIdx.x * stride_x_batch + blockIdx.y * stride_x_seq + threadIdx.x];
+    float const w = w_[threadIdx.x];
+
+    using BlockOp = cub::BlockReduce<float, BLOCK_SIZE>;
+    __shared__ typename BlockOp::TempStorage temp_storage;
+    auto acc = BlockOp(temp_storage).Reduce(x * x, cub::Sum());
+
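+    // The block-wide sum is valid only in thread 0, which computes the
+    // normalization factor and broadcasts it through shared memory.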
+    __shared__ float rms;
+    if (threadIdx.x == 0) {
+        rms = rsqrtf(acc / float(blockDim.x) + epsilon);
+    }
+    __syncthreads();
+
+    *y = Ta(rms * x * w);
+}
+
 template <unsigned int BLOCK_SIZE, unsigned int NUM_ITEMS_THREAD, class Tw, class Ta>
 static __device__ void folding(
     Ta *__restrict__ y,
@@ -79,3 +109,59 @@ static __device__ void folding(
         BlockOp(temp_storage).Store(y, data, items_size);
     }
 }
+
+// Folding variant for 3-D tensors: each thread handles NUM_ITEMS_THREAD
+// elements, so the normalized dimension may exceed the block size.
+template <unsigned int BLOCK_SIZE, unsigned int NUM_ITEMS_THREAD, class Tw, class Ta>
+static __device__ void folding_3d(
+    Ta *__restrict__ y,
+    int const stride_y_batch,
+    int const stride_y_seq,
+    Ta const *__restrict__ x,
+    int const stride_x_batch,
+    int const stride_x_seq,
+    Tw const *__restrict__ w,
+    float const epsilon,
+    unsigned int const items_size) {
+
+    // blockIdx.x = batch index, blockIdx.y = seq index
+    y += blockIdx.x * stride_y_batch + blockIdx.y * stride_y_seq;
+    x += blockIdx.x * stride_x_batch + blockIdx.y * stride_x_seq;
+
+    float data[NUM_ITEMS_THREAD], weight[NUM_ITEMS_THREAD];
+    {
+        using BlockOp = cub::BlockLoad<float, BLOCK_SIZE, NUM_ITEMS_THREAD>;
+        __shared__ typename BlockOp::TempStorage temp_storage;
+        BlockOp(temp_storage).Load(x, data, items_size, 0.f);
+        // Barrier required before reusing temp_storage for a second collective.
+        __syncthreads();
+        BlockOp(temp_storage).Load(w, weight, items_size, 0.f);
+    }
+
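+    // Out-of-range items were zero-filled by the loads above, so they
+    // contribute nothing to the sum of squares.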
+    float squared = 0;
+#pragma unroll
+    for (unsigned int i = 0; i < NUM_ITEMS_THREAD; ++i) {
+        squared += data[i] * data[i];
+    }
+
+    float acc;
+    {
+        using BlockOp = cub::BlockReduce<float, BLOCK_SIZE>;
+        __shared__ typename BlockOp::TempStorage temp_storage;
+        acc = BlockOp(temp_storage).Reduce(squared, cub::Sum());
+    }
+
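+    // As in padding_3d, only thread 0 holds the reduced sum; it derives the
+    // rms factor and broadcasts it through shared memory.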
+    __shared__ float rms;
+    if (threadIdx.x == 0) {
+        rms = rsqrtf(acc / float(items_size) + epsilon);
+    }
+    __syncthreads();
+
+#pragma unroll
+    for (unsigned int i = 0; i < NUM_ITEMS_THREAD; ++i) {
+        data[i] = rms * data[i] * weight[i];
+    }
+
+    {
+        using BlockOp = cub::BlockStore<float, BLOCK_SIZE, NUM_ITEMS_THREAD>;
+        __shared__ typename BlockOp::TempStorage temp_storage;
+        BlockOp(temp_storage).Store(y, data, items_size);
+    }
+}
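
For context, below is a minimal sketch of how these device functions might be driven from __global__ kernels and launched. The wrapper names, the hidden size d, and the launch shapes are illustrative assumptions, not part of this commit.

// Hypothetical wrappers: one block per (batch, seq) row.
template <unsigned int BLOCK_SIZE, class Tw, class Ta>
static __global__ void rms_norm_padding_3d(
    Ta *y, int sy_batch, int sy_seq,
    Ta const *x, int sx_batch, int sx_seq,
    Tw const *w, float epsilon) {
    padding_3d<BLOCK_SIZE>(y, sy_batch, sy_seq, x, sx_batch, sx_seq, w, epsilon);
}

template <unsigned int BLOCK_SIZE, unsigned int NUM_ITEMS_THREAD, class Tw, class Ta>
static __global__ void rms_norm_folding_3d(
    Ta *y, int sy_batch, int sy_seq,
    Ta const *x, int sx_batch, int sx_seq,
    Tw const *w, float epsilon, unsigned int d) {
    folding_3d<BLOCK_SIZE, NUM_ITEMS_THREAD>(y, sy_batch, sy_seq, x, sx_batch, sx_seq, w, epsilon, d);
}

// Launch sketch for a [batch, seq, d] tensor with BLOCK_SIZE = 1024:
//   dim3 grid(batch, seq);
//   if (d <= 1024)  // one thread per element, blockDim.x == d
//       rms_norm_padding_3d<1024><<<grid, d>>>(y, sy_b, sy_s, x, sx_b, sx_s, w, 1e-5f);
//   else            // requires d <= 1024 * NUM_ITEMS_THREAD
//       rms_norm_folding_3d<1024, 4><<<grid, 1024>>>(y, sy_b, sy_s, x, sx_b, sx_s, w, 1e-5f, d);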