
Commit ade4ef3

Broadcasts attention bias over query dimension
Updates the forward/backward equivalence benchmarks to create the attention bias with a singleton query dimension so that it broadcasts across queries. This aligns shapes with kernel expectations during cached decoding, reduces the memory footprint, and prevents shape mismatches across the CUDA, Triton, and Flex paths.
1 parent 2951e24 commit ade4ef3
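
A minimal sketch of the change in isolation: a bias with a singleton query dimension still adds to the full (batch, heads, query_len, key_len) score tensor via broadcasting, while storing query_len times fewer elements. The shapes below are assumed for illustration only and are not taken from the benchmarks.

import torch

batch_size, num_kv_heads, query_len, key_len = 2, 4, 8, 128

# Attention scores for every (query, key) pair.
scores = torch.randn(batch_size, num_kv_heads, query_len, key_len)

# Old benchmark shape: one bias row per query position.
bias_full = torch.randn(batch_size, num_kv_heads, query_len, key_len)

# New benchmark shape: a single row that broadcasts across the query
# dimension, matching the kernel's expectation during cached decoding.
bias_broadcast = torch.randn(batch_size, num_kv_heads, 1, key_len)

# Both add cleanly to the scores; the broadcast bias is query_len times smaller.
out_full = scores + bias_full
out_broadcast = scores + bias_broadcast
assert out_broadcast.shape == (batch_size, num_kv_heads, query_len, key_len)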

2 files changed, 5 insertions(+), 5 deletions(-)

benchmarks/backward_equivalence.py

Lines changed: 2 additions & 2 deletions
@@ -597,7 +597,7 @@ def test_cuda_backward_equivalence(accuracy_threshold=0.95):
         device=device, dtype=dtype, requires_grad=True
     )
     attn_bias = torch.randn(
-        batch_size, num_kv_heads, query_len, key_len,
+        batch_size, num_kv_heads, 1, key_len,
         device=device, dtype=torch.bfloat16
     )
     cache_position = torch.arange(key_len - query_len, key_len, device=device)
@@ -831,7 +831,7 @@ def test_triton_backward_equivalence(accuracy_threshold=0.95):
         device=device, dtype=dtype, requires_grad=True
     )
     attn_bias = torch.randn(
-        batch_size, num_kv_heads, query_len, key_len,
+        batch_size, num_kv_heads, 1, key_len,
         device=device, dtype=torch.bfloat16
     )
     cache_position = torch.arange(key_len - query_len, key_len, device=device)

benchmarks/forward_equivalence.py

Lines changed: 3 additions & 3 deletions
@@ -570,7 +570,7 @@ def test_cuda_forward_equivalence(accuracy_threshold=0.95):
         device=device, dtype=torch.bfloat16
     )
     attn_bias = torch.randn(
-        batch_size, num_kv_heads, query_len, key_len,
+        batch_size, num_kv_heads, 1, key_len,
         device=device, dtype=torch.bfloat16
     )
     cache_position = torch.arange(key_len - query_len, key_len, device=device)
@@ -758,7 +758,7 @@ def test_triton_forward_equivalence(accuracy_threshold=0.95):
         device=device, dtype=torch.bfloat16
     )
     attn_bias = torch.randn(
-        batch_size, num_kv_heads, query_len, key_len,
+        batch_size, num_kv_heads, 1, key_len,
         device=device, dtype=torch.bfloat16
     )
     cache_position = torch.arange(key_len - query_len, key_len, device=device)
@@ -963,7 +963,7 @@ def test_flex_forward_equivalence(accuracy_threshold=0.95):
         device=device, dtype=torch.bfloat16
     )
     attn_bias = torch.randn(
-        batch_size, num_kv_heads, query_len, key_len,
+        batch_size, num_kv_heads, 1, key_len,
         device=device, dtype=torch.bfloat16
     )
     cache_position = torch.arange(key_len - query_len, key_len, device=device)
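
A hedged usage sketch, not part of the diff: PyTorch's reference scaled_dot_product_attention also broadcasts a float attn_mask with a singleton query dimension, so a bias shaped like the one above composes with a standard attention call. All shapes here are assumptions for illustration, not the benchmarks' actual configuration.

import torch
import torch.nn.functional as F

batch_size, num_heads, query_len, key_len, head_dim = 1, 2, 4, 16, 8

q = torch.randn(batch_size, num_heads, query_len, head_dim)
k = torch.randn(batch_size, num_heads, key_len, head_dim)
v = torch.randn(batch_size, num_heads, key_len, head_dim)

# Float bias with a singleton query dimension; it is broadcast to
# (batch, heads, query_len, key_len) when added to the attention scores.
attn_bias = torch.randn(batch_size, num_heads, 1, key_len)

out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
assert out.shape == (batch_size, num_heads, query_len, head_dim)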
