@@ -22,6 +22,14 @@ def _fwd_kernel(
     q_stride_s,
     q_stride_h,
     q_stride_d,
+    k_stride_b,
+    k_stride_s,
+    k_stride_h,
+    k_stride_d,
+    v_stride_b,
+    v_stride_s,
+    v_stride_h,
+    v_stride_d,
     o_stride_b,
     o_stride_s,
     o_stride_h,
@@ -30,9 +38,9 @@ def _fwd_kernel(
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
 ):
-    cur_batch = tl.program_id(0)
+    cur_batch = tl.program_id(2)
     cur_head = tl.program_id(1)
-    start_m = tl.program_id(2)
+    start_m = tl.program_id(0)

     # initialize offsets
     offs_n = tl.arange(0, BLOCK_N)
@@ -49,9 +57,9 @@ def _fwd_kernel(
         start_n = tl.multiple_of(start_n, BLOCK_N)
         # -- compute qk ----
         off_k = (
-            cur_batch * q_stride_b
-            + (start_n + offs_n[None, :]) * q_stride_s
-            + cur_head * q_stride_h
+            cur_batch * k_stride_b
+            + (start_n + offs_n[None, :]) * k_stride_s
+            + cur_head * k_stride_h
             + offs_d[:, None]
         )
         k = tl.load(K + off_k, mask=(start_n + offs_n[None, :]) < seq_len, other=0.0)
@@ -71,9 +79,9 @@ def _fwd_kernel(

         # update acc
         off_v = (
-            cur_batch * q_stride_b
-            + (start_n + offs_n[:, None]) * q_stride_s
-            + cur_head * q_stride_h
+            cur_batch * v_stride_b
+            + (start_n + offs_n[:, None]) * v_stride_s
+            + cur_head * v_stride_h
             + offs_d[None, :]
         )
         v = tl.load(V + off_v, mask=(start_n + offs_n[:, None]) < seq_len, other=0.0)
@@ -104,8 +112,8 @@ def flash_attention_fwd(
     batch_size, seq_len, head_num, head_dim = q.shape

     sm_scale = 1.0 / (head_dim ** 0.5)  # compute the softmax scale factor
-    grid = (batch_size, head_num, triton.cdiv(seq_len, BLOCK))  # batch, head,
-    # grid = (triton.cdiv(seq_len, BLOCK), batch_size, head_num)  # batch, head,
+    # grid = (batch_size, head_num, triton.cdiv(seq_len, BLOCK))  # batch, head, seq blocks
+    grid = (triton.cdiv(seq_len, BLOCK), head_num, batch_size)  # seq blocks, head, batch
     num_warps = 4
     _fwd_kernel[grid](
         q,
@@ -118,6 +126,14 @@ def flash_attention_fwd(
         q.stride(1),
         q.stride(2),
         q.stride(3),
+        k.stride(0),
+        k.stride(1),
+        k.stride(2),
+        k.stride(3),
+        v.stride(0),
+        v.stride(1),
+        v.stride(2),
+        v.stride(3),
         o.stride(0),
         o.stride(1),
         o.stride(2),
@@ -157,7 +173,6 @@ def test():
     k = torch.empty((B, L, H, D), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
     v = torch.empty((B, L, H, D), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
     o = torch.empty((B, L, H, D), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2)
-
     torch_out = torch_att(q, k, v)
     import time

@@ -174,6 +189,3 @@ def test():
     print("max ", torch.max(torch.abs(torch_out - o)))
     print("mean ", torch.mean(torch.abs(torch_out - o)))
     assert torch.allclose(torch_out, o, atol=1e-2, rtol=0)
-
-
-# test()
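
A brief sketch (not part of the commit; the shapes and BLOCK value below are arbitrary assumptions) of what the two changes amount to: k and v now carry their own strides, so they no longer have to share q's memory layout, and the launch grid is reordered so the query-block index rides tl.program_id(0):

import torch
import triton

B, L, H, D, BLOCK = 2, 1024, 8, 64, 64

# separate k/v strides: a (B, H, L, D) allocation viewed as (B, L, H, D) has the
# same shape as q but different strides, so reusing q's strides for k/v would
# only be correct when all three tensors share the same layout
q = torch.empty(B, L, H, D)
k = torch.empty(B, H, L, D).permute(0, 2, 1, 3)
print(q.stride())  # (524288, 512, 64, 1)
print(k.stride())  # (524288, 64, 65536, 1)

# reordered grid: in the updated kernel, start_m = tl.program_id(0),
# cur_head = tl.program_id(1), cur_batch = tl.program_id(2)
grid = (triton.cdiv(L, BLOCK), H, B)
print(grid)  # (16, 8, 2)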