
Commit 45033ab (1 parent: e9f388e)

Fix the attention causal flag; rename collectors to align with the sglang wideep path naming; update the READMEs to reflect the renames; update the attention collectors to also collect q-heads 1 and 2 (#101)

File tree

10 files changed: +29 −47 lines

collector/README.md

Lines changed: 3 additions & 3 deletions
@@ -85,13 +85,13 @@ export OUTPUT_PATH=/path/to/output
 
 # Run DeepSeek-specific attention collector
 SGLANG_LOAD_FORMAT=dummy SGLANG_TEST_NUM_LAYERS=2 \
-python collect_attn.py --model_path $MODEL_PATH --output_path $OUTPUT_PATH
+python collect_wideep_attn.py --model_path $MODEL_PATH --output_path $OUTPUT_PATH
 
 # Run DeepSeek MLP collector
-python collect_mlp.py --model_path $MODEL_PATH --output_path $OUTPUT_PATH
+python collect_wideep_mlp.py --model_path $MODEL_PATH --output_path $OUTPUT_PATH
 
 # Run DeepSeek DeepEP MoE collector (requires 2+ GPUs)
-python collect_deepep_moe.py --model_path $MODEL_PATH --output_path $OUTPUT_PATH \
+python collect_wideep_deepep_moe.py --model_path $MODEL_PATH --output_path $OUTPUT_PATH \
     --tp_size 2 --ep_size 2 --num_experts 256
 ```
See `sglang/README.md` for detailed documentation on these collectors.

collector/collect.py

Lines changed: 2 additions & 2 deletions
@@ -403,14 +403,14 @@ def collect_sglang(num_processes: int, ops: list[str] | None = None):
         {
             "name": "sglang",
             "type": "attention_context",
-            "module": "collector.sglang.collect_normal_attn",
+            "module": "collector.sglang.collect_attn",
             "get_func": "get_context_attention_test_cases",
             "run_func": "run_attention_torch",
         },
         {
             "name": "sglang",
             "type": "attention_generation",
-            "module": "collector.sglang.collect_normal_attn",
+            "module": "collector.sglang.collect_attn",
             "get_func": "get_generation_attention_test_cases",
             "run_func": "run_attention_torch",
         },
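The registry entries above name the collector module and its functions as strings. A plausible way such entries get consumed is dynamic import plus `getattr` — this is an assumption about `collect.py`'s internals, not something the diff confirms. The sketch below uses the stdlib `math` module as a runnable stand-in for `collector.sglang.collect_attn`:

```python
import importlib

# Hypothetical resolution of a registry entry like the ones in the hunk above.
# "math"/"isqrt" are stand-ins so this sketch runs anywhere; the real entry
# points at collector.sglang.collect_attn / get_context_attention_test_cases.
entry = {
    "name": "sglang",
    "type": "attention_context",
    "module": "math",     # real entry: "collector.sglang.collect_attn"
    "get_func": "isqrt",  # real entry: "get_context_attention_test_cases"
}

# Import the named module and look up the named function on it.
mod = importlib.import_module(entry["module"])
get_func = getattr(mod, entry["get_func"])

print(get_func(49))  # math.isqrt(49) -> 7
```

Keeping the module path as a string is what made this rename a two-line change: only the registry entries had to be updated, not any import statements.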

collector/deep_collector/extract_data.py

Lines changed: 4 additions & 4 deletions
@@ -738,13 +738,13 @@ def main():
     )
     parser.add_argument(
         "--output-normal",
-        default="./deepep_normal_perf.txt",
-        help="normal output TXT file path (default: ./deepep_normal_perf.txt)",
+        default="./wideep_deepep_normal_perf.txt",
+        help="normal output TXT file path (default: ./wideep_deepep_normal_perf.txt)",
     )
     parser.add_argument(
         "--output-ll",
-        default="./deepep_ll_perf.txt",
-        help="ll output TXT file path (default: ./deepep_ll_perf.txt)",
+        default="./wideep_deepep_ll_perf.txt",
+        help="ll output TXT file path (default: ./wideep_deepep_ll_perf.txt)",
     )
     args = parser.parse_args()
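The renamed defaults can be verified with a minimal argparse stub. This is a stand-in parser reproducing only the two options from the hunk above, not the full `extract_data.py` CLI:

```python
import argparse

# Minimal stand-in mirroring the renamed defaults from the hunk above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--output-normal",
    default="./wideep_deepep_normal_perf.txt",
    help="normal output TXT file path (default: ./wideep_deepep_normal_perf.txt)",
)
parser.add_argument(
    "--output-ll",
    default="./wideep_deepep_ll_perf.txt",
    help="ll output TXT file path (default: ./wideep_deepep_ll_perf.txt)",
)

# With no CLI arguments the defaults apply; argparse maps --output-normal
# to the attribute output_normal.
args = parser.parse_args([])
print(args.output_normal)
print(args.output_ll)
```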

collector/sglang/README.md

Lines changed: 9 additions & 9 deletions
@@ -14,9 +14,9 @@ The collected performance data can be used for performance modeling, scheduling
 
 ## Overview
 
-- **collect_deepseek_attn.py**: Collects performance data for DeepSeek Attention (MLA) operators
-- **collect_deepep_moe.py**: Collects performance data for DeepSeek MoE operators
-- **collect_deepseek_mlp.py**: Collects performance data for Shared Expert (MLP) operators
+- **collect_wideep_attn.py**: Collects performance data for DeepSeek Attention (MLA) operators
+- **collect_wideep_deepep_moe.py**: Collects performance data for DeepSeek MoE operators
+- **collect_wideep_mlp.py**: Collects performance data for Shared Expert (MLP) operators
 
 ## Requirements
 
@@ -34,7 +34,7 @@ output_path = "/aiconfigurator/src/aiconfigurator/systems/data/h100_sxm/sglang/0
 ```
 
-## 1. Attention Operator Collection (collect_deepseek_attn.py)
+## 1. Attention Operator Collection (collect_wideep_attn.py)
 
 ### Features
 - Tests different attention backends (flashinfer, fa3)
@@ -47,7 +47,7 @@ output_path = "/aiconfigurator/src/aiconfigurator/systems/data/h100_sxm/sglang/0
 #### Basic Run with dummy weight
 ```bash
 export DEEPSEEK_MODEL_PATH=/path/to/deepseek-v3
-python collect_deepseek_attn.py
+python collect_wideep_attn.py
 ```
 #### Environment Variables
 - `DEEPSEEK_MODEL_PATH`: Path to DeepSeek model
@@ -72,7 +72,7 @@ Output format:
 framework,version,device,op_name,kernel_source,mla_dtype,kv_cache_dtype,num_heads,batch_size,isl,tp_size,step,latency
 ```
 
-## 2. MoE Operator Collection (collect_deepep_moe.py)
+## 2. MoE Operator Collection (collect_wideep_deepep_moe.py)
 
 ### Features
 - Tests DeepEP MoE operator performance
@@ -85,7 +85,7 @@ framework,version,device,op_name,kernel_source,mla_dtype,kv_cache_dtype,num_head
 #### Basic Run
 ```bash
 export DEEPSEEK_MODEL_PATH=/path/to/deepseek-v3
-python collect_deepep_moe.py
+python collect_wideep_deepep_moe.py
 ```
 
 #### Environment Variables
@@ -139,7 +139,7 @@ Output format:
 framework,version,device,op_name,kernel_source,moe_dtype,num_tokens,hidden_size,inter_size,topk,num_experts,moe_tp_size,moe_ep_size,distribution,latency
 ```
 
-## 3. MLP Operator Collection (collect_deepseek_mlp.py)
+## 3. MLP Operator Collection (collect_wideep_mlp.py)
 
 ### Features
 - Tests DeepSeek V2/V3 MLP operator performance
@@ -151,7 +151,7 @@ framework,version,device,op_name,kernel_source,moe_dtype,num_tokens,hidden_size,
 #### Basic Run
 ```bash
 export DEEPSEEK_MODEL_PATH=/path/to/deepseek-v3
-python collect_deepseek_mlp.py
+python collect_wideep_mlp.py
 ```
 
 #### Environment Variables
Lines changed: 6 additions & 24 deletions
@@ -23,7 +23,7 @@ def get_context_attention_test_cases():
     test_cases = []
     b_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
     s_list = [16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 10240, 12288, 16384, 262144]
-    n_list = [4, 8, 12, 16, 24, 32, 40, 48, 64, 96]
+    n_list = [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 64, 96]
     n_kv_list = [0, 1, 2, 4, 8]
     for n in sorted(n_list, reverse=True):
         for s in sorted(s_list, reverse=True):
@@ -74,8 +74,8 @@ def get_generation_attention_test_cases():
     # the i-th token to record. 1 for context phase. mapping to osl definition
     s_list = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
     # full n {4, 5, 7, 8, 9, 10, 12, 14, 16, 18, 20, 24, 28, 32, 36, 40, 48, 56, 72, 96}
-    n_list = [4, 8, 12, 16, 24, 32, 40, 48, 64]
-    n_list_xqa = [4, 8, 16, 32, 64, 96, 128]
+    n_list = [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 64]
+    n_list_xqa = [1, 2, 4, 8, 16, 32, 64, 96, 128]
     n_kv_list = [1, 2, 4, 8]
 
     # MHA
@@ -193,14 +193,7 @@ def run_attention_torch(
     k_cache, v_cache = [x.detach().to(kvtype).requires_grad_() for x in [k_cache, v_cache]]
     k, v, cache_seqlens = None, None, None
 
-    def float16attn_fp8kvcache(q, k_cache, v_cache, k, v, **kwargs):
-        k_cache = k_cache.to(torch.bfloat16)
-        v_cache = v_cache.to(torch.bfloat16)
-        k = None if k is None else k.to(torch.bfloat16)
-        v = None if v is None else v.to(torch.bfloat16)
-        flash_attn_func_v3(q, k_cache, v_cache, k, v, **kwargs)
-
-    if use_fp8_context_fmha:
+    if use_fp8_context_fmha or use_fp8_kv_cache:
         q = q.to(kvtype)
         m1 = time_fwd(
             flash_attn_func_v3,
@@ -210,19 +203,7 @@ def float16attn_fp8kvcache(q, k_cache, v_cache, k, v, **kwargs):
             k,
             v,
             cache_seqlens=cache_seqlens,
-            repeats=10,
-            verbose=True,
-            desc="Fav3",
-        )
-    elif use_fp8_kv_cache:
-        m1 = time_fwd(
-            float16attn_fp8kvcache,
-            q,
-            k_cache,
-            v_cache,
-            k,
-            v,
-            cache_seqlens=cache_seqlens,
+            causal=True,
             repeats=10,
             verbose=True,
             desc="Fav3",
@@ -236,6 +217,7 @@ def float16attn_fp8kvcache(q, k_cache, v_cache, k, v, **kwargs):
             k,
             v,
             cache_seqlens=cache_seqlens,
+            causal=True,
             repeats=10,
             verbose=True,
             desc="Fav3",
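The causal fix above passes `causal=True` to the benchmarked flash-attention calls, so the timed kernel matches the masked attention that decoder models actually run. The toy below (not the collector's code; head_dim of 1, values omitted) illustrates what a causal mask does to the attention weights:

```python
import math

# Illustrative toy: with a causal mask, query i may attend only to keys
# j <= i. Masked scores are set to -inf before softmax, so future positions
# get weight exactly 0 — roughly half the score matrix becomes dead work a
# causal kernel can skip, which is why the flag matters for benchmark parity.

def softmax(row):
    m = max(x for x in row if x != float("-inf"))
    exps = [0.0 if x == float("-inf") else math.exp(x - m) for x in row]
    s = sum(exps)
    return [e / s for e in exps]

def attention_weights(q, k, causal):
    # scores[i][j] = q_i * k_j (scalar "head_dim 1" for brevity)
    n = len(q)
    weights = []
    for i in range(n):
        scores = [
            float("-inf") if (causal and j > i) else q[i] * k[j]
            for j in range(n)
        ]
        weights.append(softmax(scores))
    return weights

w = attention_weights([1.0, 1.0, 1.0], [0.5, 0.5, 0.5], causal=True)
print(w[0])  # first query attends only to position 0: [1.0, 0.0, 0.0]
```

Without the flag, a bidirectional kernel computes (and times) the full score matrix, so timings collected before this fix would understate causal-decode performance relative to the serving path.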
File renamed without changes.

collector/trtllm/collect_attn.py

Lines changed: 3 additions & 3 deletions
@@ -293,7 +293,7 @@ def get_context_attention_test_cases():
         16384,
         262144,
     ]
-    n_list = [4, 8, 12, 16, 24, 32, 40, 48, 64, 96]
+    n_list = [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 64, 96]
     n_kv_list = [0, 1, 2, 4, 8]
     head_dim = [64, 128]
 
@@ -507,8 +507,8 @@ def get_generation_attention_test_cases():
         65536,
         131072,
     ]
-    n_list = [4, 8, 12, 16, 24, 32, 40, 48, 64]
-    n_list_xqa = [4, 8, 16, 32, 64, 96, 128]
+    n_list = [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 64]
+    n_list_xqa = [1, 2, 4, 8, 16, 32, 64, 96, 128]
     n_kv_list = [1, 2, 4, 8]
     head_dim = [64, 128]

collector/vllm/collect_attn.py

Lines changed: 2 additions & 2 deletions
@@ -304,7 +304,7 @@ def get_context_attention_test_cases(if_unit_test=False):
         16384,
         262144,
     ]
-    n_list = [4, 8, 12, 16, 24, 32, 40, 48, 64]
+    n_list = [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 64]
     n_kv_list = [0, 1, 2, 4, 8]
     # n_kv_list = [64]
 else:
@@ -360,7 +360,7 @@ def get_generation_attention_test_cases():
 
     b_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
     # b_list_xqa = [1,2,4,8,16,32,64,128,256,512,1024,2048]
-    n_list = [4, 8, 12, 16, 24, 32, 40, 48, 64]
+    n_list = [1, 2, 4, 8, 12, 16, 24, 32, 40, 48, 64]
     # n_list_xqa = [4,8,16,32,64,128]
     s_list = [
         2,
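Across the sglang, trtllm, and vllm collectors the change is the same: q-head counts 1 and 2 are prepended to the sweep lists. The sketch below shows how such nested sweeps expand into test cases; the loop order mirrors `get_context_attention_test_cases` from the sglang hunk, and the lists are truncated here for brevity (the real sweeps are much larger, and other backends may nest their loops differently):

```python
# Hedged sketch of a collector-style sweep expansion (truncated lists).
n_list = [1, 2, 4, 8]  # q-head counts; 1 and 2 are the newly added values
s_list = [16, 32]      # sequence lengths (truncated)
b_list = [1, 2]        # batch sizes (truncated)

test_cases = []
for n in sorted(n_list, reverse=True):      # largest head counts first
    for s in sorted(s_list, reverse=True):  # longest sequences first
        for b in b_list:
            test_cases.append((b, s, n))

print(len(test_cases))  # 4 head counts x 2 seq lens x 2 batches = 16 cases
```

Each added head-count value multiplies the whole grid, so adding 1 and 2 grows every sweep by two full (s, b) planes of benchmark runs.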
