
Commit 9020f59

Author: niushengxiao · committed
feat: add per tensor fp8 kv quant for flashinfer
1 parent 79b8280 · commit 9020f59

11 files changed: +1522, -26 lines

lightllm/common/offline_fp8_quant_mem_manager.py

Lines changed: 38 additions & 18 deletions
@@ -30,25 +30,33 @@ def __init__(
         self.total_head_num = head_num * dist.get_world_size() if dist.is_initialized() else head_num
         self.count = 0
         self.scales = None
+        self.scales_list = []
         self.abs_max = None

         if is_export_mode:
-            self.abs_max = torch.zeros((layer_num, 2 * head_num), dtype=torch.float32, device="cuda")
+            scales_shape = [layer_num, 2 * head_num] if get_env_start_args().enable_fa3 else [layer_num, 2]
+            self.abs_max = torch.zeros(scales_shape, dtype=torch.float32, device="cuda")
         elif get_env_start_args().kv_quant_calibration_config_path is not None:
             logger.info(
                 f"kv_quant_calibration_config_path {get_env_start_args().kv_quant_calibration_config_path} is set, "
                 "will load kv quant calibration config"
             )
             cfg = self._load_and_check_config()

-            self.scales = torch.tensor(cfg["scales"], dtype=torch.float32, device="cuda").view(cfg["scales_shape"])
-            if dist.is_initialized() and dist.get_world_size() > 1:
+            self.scales_list = cfg["scales"]
+            self.scales = torch.tensor(self.scales_list, dtype=torch.float32, device="cuda").view(cfg["scales_shape"])
+            if not get_env_start_args().enable_fa3:
+                self.scales = torch.repeat_interleave(self.scales, self.head_num, dim=-1)
+            if get_env_start_args().enable_fa3 and dist.is_initialized() and dist.get_world_size() > 1:
                 half_head = self.total_head_num // 2
                 start_head = dist.get_rank() * head_num
                 end_head = start_head + head_num
                 k_scales = self.scales[:, start_head:end_head].contiguous()
                 v_scales = self.scales[:, start_head + half_head : end_head + half_head].contiguous()
-                self.scales = torch.cat((k_scales, v_scales), dim=-1)
+                current_scales = torch.cat((k_scales, v_scales), dim=-1)
+
+                self.scales_list = current_scales.tolist()
+                self.scales = current_scales
         else:
             logger.warning("scales is None, no kv_quant_calibration_config_path be set, will use 1.0 as scales")

@@ -74,8 +82,12 @@ def _load_and_check_config(self):
                 raise ValueError(
                     f"num_head {cfg['num_head']} in config " f"not match current model head num {self.total_head_num}"
                 )
-            if cfg["quant_type"] != "per_head":
-                raise ValueError(f"quant type {cfg['quant_type']} in config not match fa3 backend")
+            if get_env_start_args().enable_fa3:
+                if cfg["quant_type"] != "per_head":
+                    raise ValueError(f"quant type {cfg['quant_type']} in config not match fa3 backend")
+            else:
+                if cfg["quant_type"] != "per_tensor":
+                    raise ValueError(f"quant type {cfg['quant_type']} in config not match flashinfer backend")

             return cfg
         else:
@@ -93,21 +105,29 @@ def update_calibration_data(self, kv_buffer: torch.Tensor, layer_index: int):
             logger.info("kv cache calibration mode will collect kv cache data for quantization calibration")

         if self.abs_max is not None and self.count >= warmup_counts:
-            kv_max = kv_buffer.abs().amax(dim=(0, 2)).to(torch.float32)
+            if get_env_start_args().enable_fa3:
+                kv_max = kv_buffer.abs().amax(dim=(0, 2)).to(torch.float32)
+            else:
+                k_max = kv_buffer[:, : self.head_num, :].abs().amax(dim=()).to(torch.float32)
+                v_max = kv_buffer[:, self.head_num :, :].abs().amax(dim=()).to(torch.float32)
+                kv_max = torch.tensor([k_max, v_max], device="cuda", dtype=torch.float32)
             self.abs_max[layer_index] = torch.maximum(self.abs_max[layer_index], kv_max)
             if self.count == warmup_counts + inference_counts - 1 and layer_index == self.layer_num - 1:
                 final_abs_max = self.abs_max
                 if dist.is_initialized() and dist.get_world_size() > 1:
-                    k_max, v_max = torch.chunk(self.abs_max, 2, dim=-1)
-                    k_max = k_max.contiguous()
-                    v_max = v_max.contiguous()
-                    gathered_k_max = [torch.zeros_like(k_max) for _ in range(dist.get_world_size())]
-                    gathered_v_max = [torch.zeros_like(v_max) for _ in range(dist.get_world_size())]
-                    dist.all_gather(gathered_k_max, k_max, group=None, async_op=False)
-                    dist.all_gather(gathered_v_max, v_max, group=None, async_op=False)
-                    k_max = torch.cat(gathered_k_max, dim=-1)
-                    v_max = torch.cat(gathered_v_max, dim=-1)
-                    final_abs_max = torch.cat((k_max, v_max), dim=-1)
+                    if get_env_start_args().enable_fa3:
+                        k_max, v_max = torch.chunk(self.abs_max, 2, dim=-1)
+                        k_max = k_max.contiguous()
+                        v_max = v_max.contiguous()
+                        gathered_k_max = [torch.zeros_like(k_max) for _ in range(dist.get_world_size())]
+                        gathered_v_max = [torch.zeros_like(v_max) for _ in range(dist.get_world_size())]
+                        dist.all_gather(gathered_k_max, k_max, group=None, async_op=False)
+                        dist.all_gather(gathered_v_max, v_max, group=None, async_op=False)
+                        k_max = torch.cat(gathered_k_max, dim=-1)
+                        v_max = torch.cat(gathered_v_max, dim=-1)
+                        final_abs_max = torch.cat((k_max, v_max), dim=-1)
+                    else:
+                        dist.all_reduce(self.abs_max, op=dist.ReduceOp.MAX, group=None, async_op=False)

                 self.scales = final_abs_max / self.qmax
                 self.scales = torch.where(self.scales > 0, self.scales, torch.ones_like(self.scales))
@@ -124,7 +144,7 @@ def _export_calibration_data(self):
         cfg = {
             "version": "1.0",
             "architectures": model_arch,
-            "quant_type": "per_head",
+            "quant_type": "per_head" if get_env_start_args().enable_fa3 else "per_tensor",
            "qmin": self.qmin,
            "qmax": self.qmax,
            "num_layers": self.layer_num,

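The backend difference above is one of calibration granularity: the fa3 path keeps per-head scales of shape [layer_num, 2 * head_num], while the flashinfer path collapses them to a single K scale and a single V scale per layer, shape [layer_num, 2], which is what the new "per_tensor" quant type records. A minimal sketch of that per-tensor math, using hypothetical helper names that are not part of this commit:

import torch

# Per-tensor calibration sketch (illustrative only).
# abs_max has shape [layer_num, 2]: column 0 holds the K abs-max, column 1 the V abs-max.
def per_tensor_kv_scales(abs_max: torch.Tensor, qmax: float) -> torch.Tensor:
    scales = abs_max / qmax
    # guard against layers that never saw calibration data, mirroring the torch.where(...) above
    return torch.where(scales > 0, scales, torch.ones_like(scales))

def quantize_kv_layer(kv_buffer: torch.Tensor, head_num: int, k_scale: float, v_scale: float):
    # kv_buffer: [token_num, 2 * head_num, head_dim], K heads first, then V heads
    fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3
    k = (kv_buffer[:, :head_num, :] / k_scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    v = (kv_buffer[:, head_num:, :] / v_scale).clamp(-fp8_max, fp8_max).to(torch.float8_e4m3fn)
    return k, v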
lightllm/models/llama/layer_infer/transformer_layer_infer.py

Lines changed: 56 additions & 2 deletions
@@ -129,6 +129,15 @@ def _bind_attention(self):
         elif "triton_int8kv" in self.mode:
             self._token_attention_kernel = partial(LlamaTransformerLayerInfer._token_decode_attention_int8kv, self)
             self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_int8kv, self)
+        elif "offline_calibration_fp8kv" in self.mode:
+            assert get_env_start_args().enable_flashinfer_prefill and get_env_start_args().enable_flashinfer_decode
+            self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_fp8kv, self)
+            self._context_attention_kernel = partial(
+                LlamaTransformerLayerInfer._context_attention_flashinfer_kernel_fp8, self
+            )
+            self._token_attention_kernel = partial(
+                LlamaTransformerLayerInfer._token_decode_attention_flashinfer_fp8, self
+            )
         elif "triton_flashdecoding" in self.mode:
             self._token_attention_kernel = partial(
                 LlamaTransformerLayerInfer._token_decode_attention_flashdecoding, self
@@ -147,14 +156,19 @@ def _bind_attention(self):
                 LlamaTransformerLayerInfer._token_decode_attention_gqa_flashdecoding_vsm, self
             )
             self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
-        elif not self.mode:
+        elif "export_fp8kv_calibration" in self.mode or not self.mode:
             if get_env_start_args().enable_flashinfer_decode:
                 self._token_attention_kernel = partial(
                     LlamaTransformerLayerInfer._token_decode_attention_flashinfer, self
                 )
             else:
                 self._token_attention_kernel = partial(LlamaTransformerLayerInfer._token_decode_attention_normal, self)
-            self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
+            if "export_fp8kv_calibration" in self.mode:
+                self._copy_kv_to_mem_cache = partial(
+                    LlamaTransformerLayerInfer._copy_kv_to_mem_cache_with_calibration, self
+                )
+            else:
+                self._copy_kv_to_mem_cache = partial(LlamaTransformerLayerInfer._copy_kv_to_mem_cache_normal, self)
         else:
             raise Exception(f"Unsupported mode: {self.mode}")

@@ -214,6 +228,26 @@ def _tpsp_get_qkv(
         )
         return q, cache_kv

+    def _context_attention_flashinfer_kernel_fp8(
+        self, q, kv, infer_state: LlamaFlashInferStateInfo, layer_weight, out=None
+    ) -> torch.Tensor:
+        o_tensor = self.alloc_tensor(q.shape, q.dtype) if out is None else out
+        kv = infer_state.mem_manager.kv_buffer[self.layer_num_]
+        kv = kv.unsqueeze(1)
+        k = kv[:, :, : self.tp_k_head_num_, :].view(torch.float8_e4m3fn)
+        v = kv[:, :, self.tp_k_head_num_ :, :].view(torch.float8_e4m3fn)
+        offline_scales = infer_state.mem_manager.scales_list
+        k_descale = offline_scales[self.layer_num_][0] if offline_scales is not None else None
+        v_descale = offline_scales[self.layer_num_][1] if offline_scales is not None else None
+        infer_state.prefill_wrapper.run(
+            q.view(q.shape[0], -1, self.head_dim_),
+            (k, v),
+            k_scale=k_descale,
+            v_scale=v_descale,
+            out=o_tensor.view(q.shape[0], -1, self.head_dim_),
+        )
+        return o_tensor
+
     def _context_attention_flashinfer_kernel(
         self, q, kv, infer_state: LlamaFlashInferStateInfo, layer_weight, out=None
     ) -> torch.Tensor:
@@ -474,6 +508,26 @@ def _copy_kv_to_mem_cache_ppl_int4kv(self, buffer, mem_index, mem_manager):
         )
         return

+    def _token_decode_attention_flashinfer_fp8(self, q, infer_state: LlamaFlashInferStateInfo, layer_weight, out=None):
+        batch_size = infer_state.batch_size
+        calcu_shape1 = (batch_size, self.tp_q_head_num_, self.head_dim_)
+
+        o_tensor = self.alloc_tensor(q.shape, q.dtype) if out is None else out
+        kv = infer_state.mem_manager.kv_buffer[self.layer_num_].unsqueeze(1)
+        k = kv[:, :, : self.tp_k_head_num_, :].view(torch.float8_e4m3fn)
+        v = kv[:, :, self.tp_k_head_num_ :, :].view(torch.float8_e4m3fn)
+        offline_scales = infer_state.mem_manager.scales_list
+        k_descale = offline_scales[self.layer_num_][0] if offline_scales is not None else None
+        v_descale = offline_scales[self.layer_num_][1] if offline_scales is not None else None
+        infer_state.decode_wrapper.run(
+            q.view(calcu_shape1),
+            (k, v),
+            k_scale=k_descale,
+            v_scale=v_descale,
+            out=o_tensor.view(calcu_shape1),
+        )
+        return o_tensor
+
     def _token_decode_attention_flashinfer(self, q, infer_state: LlamaFlashInferStateInfo, layer_weight, out=None):
         batch_size = infer_state.batch_size
         calcu_shape1 = (batch_size, self.tp_q_head_num_, self.head_dim_)
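Both new fp8 attention paths reinterpret the per-layer kv cache as torch.float8_e4m3fn and hand flashinfer the per-layer scales as plain Python floats taken from mem_manager.scales_list ([layer][0] for K, [layer][1] for V), which matches the scalar k_scale / v_scale arguments of the wrapper run() calls above. Conceptually the scales act as dequantization factors; a rough, non-fused reference of the same semantics, with a hypothetical helper that is not part of this commit:

import torch

# Reference-only dequantization of one layer's fp8 kv cache (illustrative sketch).
def dequant_kv_layer(kv_fp8: torch.Tensor, head_num: int, k_scale: float, v_scale: float,
                     dtype: torch.dtype = torch.bfloat16):
    # kv_fp8: [token_num, 2 * head_num, head_dim] viewed as float8_e4m3fn
    k = kv_fp8[:, :head_num, :].to(dtype) * k_scale
    v = kv_fp8[:, head_num:, :].to(dtype) * v_scale
    return k, v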

lightllm/models/llama/model.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ def __init__(self, model):
             ),
         ]
         self.q_data_type = model.data_type
-        self.kv_data_type = model.data_type
+        self.kv_data_type = torch.float8_e4m3fn if "offline_calibration_fp8kv" in model.mode else model.data_type


 @ModelRegistry("llama")

lightllm/server/api_cli.py

Lines changed: 3 additions & 2 deletions
@@ -170,10 +170,11 @@ def make_argument_parser() -> argparse.ArgumentParser:
     triton_gqa_attention and triton_gqa_flashdecoding is fast kernel for model which use GQA;
     triton_int8kv mode use int8 to store kv cache, can increase token capacity, use triton kernel;
     triton_fp8kv mode use float8 to store kv cache, currently only for deepseek2;
-    offline_calibration_fp8kv mode use float8 to store kv cache, need fa3 backend,
+    offline_calibration_fp8kv mode use float8 to store kv cache, need fa3 or flashinfer backend,
     currently only for llama and qwen model;
     export_fp8kv_calibration record and export kv cache quant calibration results to a json file.
-    It can be used for llama and qwen model. Calibration need to disable cudagraph and fa3 backend.
+    It can be used for llama and qwen model.
+    Calibration need to disable cudagraph and use fa3 or flashinfer backend.
     ppl_int8kv mode use int8 to store kv cache, and use ppl fast kernel;
     ppl_fp16 mode use ppl fast fp16 decode attention kernel;
     you need to read source code to make sure the supported detail mode for all models""",

lightllm/server/api_start.py

Lines changed: 12 additions & 2 deletions
@@ -111,9 +111,19 @@ def normal_or_p_d_start(args):
         assert args.disable_dynamic_prompt_cache is True, "need add --disable_dynamic_prompt_cache"
         assert args.disable_chunked_prefill is True, "need add --disable_chunked_prefill"
     if "offline_calibration_fp8kv" in args.mode:
-        assert args.enable_fa3 is True, "offline_calibration_fp8kv mode need enable fa3"
+        assert args.enable_fa3 is True or (
+            args.enable_flashinfer_prefill is True and args.enable_flashinfer_decode is True
+        ), (
+            "offline_calibration_fp8kv mode need enable fa3 or flashinfer, add --enable_fa3 or "
+            "--enable_flashinfer_prefill and --enable_flashinfer_decode"
+        )
     if "export_fp8kv_calibration" in args.mode:
-        assert args.enable_fa3 is True, "export_fp8kv_calibration mode need enable fa3"
+        assert args.enable_fa3 is True or (
+            args.enable_flashinfer_prefill is True and args.enable_flashinfer_decode is True
+        ), (
+            "export_fp8kv_calibration mode need enable fa3 or flashinfer, add --enable_fa3 or "
+            "--enable_flashinfer_prefill and --enable_flashinfer_decode"
+        )
         assert args.disable_cudagraph is True, "export_fp8kv_calibration mode need disable cudagraph"

     # Some modes cannot yet work together with the advanced dynamic scheduling algorithms; to do.

lightllm/utils/envs_utils.py

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ def get_redundancy_expert_update_max_load_count():

 @lru_cache(maxsize=None)
 def get_kv_quant_calibration_warmup_count():
-    # The first warmup inference passes after server startup are not counted in the quantization calibration statistics; this parameter allows calibration to start at different positions in a larger calibration dataset
+    # The first warmup inference passes after server startup are not counted in the quantization calibration statistics; this parameter allows calibration to start at a different position within a larger calibration dataset
     return int(os.getenv("LIGHTLLM_KV_QUANT_CALIBRARTION_WARMUP_COUNT", 0))

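The warmup count lets calibration skip a number of inference passes after startup, for example to begin collecting statistics at a later point in a larger calibration set. A hedged usage example (the value 32 is arbitrary, chosen only for illustration):

import os

# Skip the first 32 inference passes before kv quant calibration statistics are collected.
# Must be set in the environment of the server process before it starts.
os.environ["LIGHTLLM_KV_QUANT_CALIBRARTION_WARMUP_COUNT"] = "32"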
