Skip to content

Commit c402a65

Browse files
committed
[EXP] Mask off out-of-bound dot products
WARNING: does not yet pass the entire unit test sweep. Does not affect existing functionality; the failing cases miss the same error threshold that existing use cases (without clipping) are held to.
1 parent 3b61d7c commit c402a65

File tree

5 files changed

+106
-59
lines changed

5 files changed

+106
-59
lines changed

csrc/include/natten/cuda/fna/epilogue/epilogue_rescale_output.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,9 @@ class MemoryEfficientAttentionNormalize {
179179
multiplies<ComputeFragment> mul_add_source;
180180
multiply_add<ComputeFragment> mul_add_accumulator;
181181

182-
ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
182+
auto s_prime = s_prime_[row];
183+
auto scale = s_prime == 0 ? 0 : 1 / s_prime;
184+
ElementCompute alpha = isLast ? scale : 1;
183185
ElementCompute beta = alpha * m_prime_[row];
184186

185187
intermediate = mul_add_source(beta, converted_source); // X = beta * C
@@ -209,7 +211,9 @@ class MemoryEfficientAttentionNormalize {
209211
ComputeFragment intermediate;
210212
multiplies<ComputeFragment> mul_accumulator;
211213

212-
ElementCompute alpha = isLast ? (1 / s_prime_[row]) : 1;
214+
auto s_prime = s_prime_[row];
215+
auto scale = s_prime == 0 ? 0 : 1 / s_prime;
216+
ElementCompute alpha = isLast ? scale : 1;
213217

214218
intermediate = mul_accumulator(
215219
alpha, converted_accumulator); // X = alpha * C + uniform

csrc/include/natten/cuda/fna/kernel_backward.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1639,30 +1639,34 @@ struct FusedNeighborhoodAttentionBackwardKernel {
16391639
auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset(
16401640
lane_id, warp_id, output_tile_coords);
16411641

1642-
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
1643-
// SCALING.
1642+
// Dot product scale
1643+
accum = cutlass::multiplies<typename Mma::FragmentC>()(scale, accum);
1644+
1645+
// (Optional) clip dot products (mask off out of bound dot products)
16441646
if (p.has_dot_product_clip) {
16451647
if (not p.has_dot_product_max) {
16461648
for (int i = 0; i < Mma::FragmentC::kElements; ++i) {
1647-
accum[i] = cutlass::fast_max(accum[i], p.dot_product_min);
1649+
accum[i] = accum[i] < p.dot_product_min
1650+
? -cutlass::platform::numeric_limits<accum_t>::infinity()
1651+
: accum[i];
16481652
}
16491653
} else if (not p.has_dot_product_min) {
16501654
for (int i = 0; i < Mma::FragmentC::kElements; ++i) {
1651-
accum[i] = cutlass::fast_min(accum[i], p.dot_product_max);
1655+
accum[i] = accum[i] > p.dot_product_max
1656+
? -cutlass::platform::numeric_limits<accum_t>::infinity()
1657+
: accum[i];
16521658
}
16531659
} else {
16541660
// assert(p.has_dot_product_min && p.has_dot_product_max);
16551661
for (int i = 0; i < Mma::FragmentC::kElements; ++i) {
1656-
accum[i] = cutlass::fast_max(
1657-
cutlass::fast_min(accum[i], p.dot_product_max),
1658-
p.dot_product_min);
1662+
accum[i] =
1663+
(accum[i] < p.dot_product_min || accum[i] > p.dot_product_max)
1664+
? -cutlass::platform::numeric_limits<accum_t>::infinity()
1665+
: accum[i];
16591666
}
16601667
}
16611668
}
16621669

1663-
// Dot product scale
1664-
accum = cutlass::multiplies<typename Mma::FragmentC>()(scale, accum);
1665-
16661670
if (not p.is_fully_block_sparse) {
16671671
// Neighborhood Attention masking
16681672
Dim first_col, query_bound, row_idx;

csrc/include/natten/cuda/fna/kernel_forward.h

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -745,23 +745,30 @@ struct FusedNeighborhoodAttentionKernel {
745745
MM1::Mma::drain_cp_asyncs();
746746
}
747747

748-
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
749-
// SCALING.
748+
// (Optional) clip dot products (mask off out of bound dot products)
750749
if (p.has_dot_product_clip) {
751750
if (not p.has_dot_product_max) {
752751
for (int i = 0; i < MM0::Mma::FragmentC::kElements; ++i) {
753-
accum[i] = cutlass::fast_max(accum[i], p.dot_product_min);
752+
accum[i] = accum[i] * p.scale;
753+
accum[i] = accum[i] < p.dot_product_min
754+
? -cutlass::platform::numeric_limits<accum_t>::infinity()
755+
: accum[i];
754756
}
755757
} else if (not p.has_dot_product_min) {
756758
for (int i = 0; i < MM0::Mma::FragmentC::kElements; ++i) {
757-
accum[i] = cutlass::fast_min(accum[i], p.dot_product_max);
759+
accum[i] = accum[i] * p.scale;
760+
accum[i] = accum[i] > p.dot_product_max
761+
? -cutlass::platform::numeric_limits<accum_t>::infinity()
762+
: accum[i];
758763
}
759764
} else {
760765
// assert(p.has_dot_product_min && p.has_dot_product_max);
761766
for (int i = 0; i < MM0::Mma::FragmentC::kElements; ++i) {
762-
accum[i] = cutlass::fast_max(
763-
cutlass::fast_min(accum[i], p.dot_product_max),
764-
p.dot_product_min);
767+
accum[i] = accum[i] * p.scale;
768+
accum[i] =
769+
(accum[i] < p.dot_product_min || accum[i] > p.dot_product_max)
770+
? -cutlass::platform::numeric_limits<accum_t>::infinity()
771+
: accum[i];
765772
}
766773
}
767774
}
@@ -823,7 +830,7 @@ struct FusedNeighborhoodAttentionKernel {
823830
last_kv_col,
824831
is_first_kv_iter,
825832
iteratorC_tile_offset,
826-
p.scale);
833+
p.has_dot_product_clip ? 1.0 : p.scale);
827834

828835
// Output results to shared-memory
829836

@@ -999,8 +1006,13 @@ struct FusedNeighborhoodAttentionKernel {
9991006
map_index_to_coord((int32_t)thread_id(), problem_size_0_m);
10001007
auto query_offset = (query_idx * p.lse_strideM).sum();
10011008
if (is_coord_within_upper_bound(query_idx, problem_size_0_m)) {
1002-
p.logsumexp_ptr[query_offset] = accum_t(mi[thread_id()] / kLog2e) +
1003-
cutlass::fast_log(accum_t(s_prime[thread_id()]));
1009+
if (mi[thread_id()] ==
1010+
-cutlass::platform::numeric_limits<accum_t>::infinity()) {
1011+
p.logsumexp_ptr[query_offset] = 0.0f;
1012+
} else {
1013+
p.logsumexp_ptr[query_offset] = accum_t(mi[thread_id()] / kLog2e) +
1014+
cutlass::fast_log(accum_t(s_prime[thread_id()]));
1015+
}
10041016
//} else if (query_offset < lse_dim) {
10051017
// p.logsumexp_ptr[query_offset] =
10061018
// cutlass::platform::numeric_limits<accum_t>::infinity();

csrc/include/natten/cuda/reference/fna_reference_backward.hpp

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -116,23 +116,30 @@ void __global__ fna_bwd_reference_dQ_kernel(
116116
acc_doo += mDO(idx_Q, idx_D1, idx_L) * mO(idx_Q, idx_D1, idx_L);
117117
} // for idx_D1
118118

119-
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
120-
// SCALING.
119+
acc_qk *= attn_scale;
120+
acc_dov *= attn_scale;
121+
acc_doo *= attn_scale;
122+
123+
// (Optional) clip dot products (mask off out of bound dot products)
121124
if (has_dot_product_min || has_dot_product_max) {
122125
if (not has_dot_product_max) {
123-
acc_qk = cutlass::fast_max(acc_qk, dot_product_min);
126+
acc_qk = acc_qk < dot_product_min
127+
? -cutlass::platform::numeric_limits<
128+
ElementAccumulator>::infinity()
129+
: acc_qk;
124130
} else if (not has_dot_product_min) {
125-
acc_qk = cutlass::fast_min(acc_qk, dot_product_max);
131+
acc_qk = acc_qk > dot_product_max
132+
? -cutlass::platform::numeric_limits<
133+
ElementAccumulator>::infinity()
134+
: acc_qk;
126135
} else {
127-
acc_qk = cutlass::fast_max(
128-
cutlass::fast_min(acc_qk, dot_product_max), dot_product_min);
136+
acc_qk = (acc_qk < dot_product_min || acc_qk > dot_product_max)
137+
? -cutlass::platform::numeric_limits<
138+
ElementAccumulator>::infinity()
139+
: acc_qk;
129140
}
130141
}
131142

132-
acc_qk *= attn_scale;
133-
acc_dov *= attn_scale;
134-
acc_doo *= attn_scale;
135-
136143
auto id = make_identity_tensor(make_shape(1, 1));
137144
auto frag = make_tensor<ElementAccumulator>(Shape<_1, _1>{});
138145
frag(0) = acc_qk;
@@ -246,23 +253,30 @@ void __global__ fna_bwd_reference_dK_kernel(
246253
acc_doo += mDO(idx_Q, idx_D1, idx_L) * mO(idx_Q, idx_D1, idx_L);
247254
} // for idx_D1
248255

249-
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
250-
// SCALING.
256+
acc_qk *= attn_scale;
257+
acc_dov *= attn_scale;
258+
acc_doo *= attn_scale;
259+
260+
// (Optional) clip dot products (mask off out of bound dot products)
251261
if (has_dot_product_min || has_dot_product_max) {
252262
if (not has_dot_product_max) {
253-
acc_qk = cutlass::fast_max(acc_qk, dot_product_min);
263+
acc_qk = acc_qk < dot_product_min
264+
? -cutlass::platform::numeric_limits<
265+
ElementAccumulator>::infinity()
266+
: acc_qk;
254267
} else if (not has_dot_product_min) {
255-
acc_qk = cutlass::fast_min(acc_qk, dot_product_max);
268+
acc_qk = acc_qk > dot_product_max
269+
? -cutlass::platform::numeric_limits<
270+
ElementAccumulator>::infinity()
271+
: acc_qk;
256272
} else {
257-
acc_qk = cutlass::fast_max(
258-
cutlass::fast_min(acc_qk, dot_product_max), dot_product_min);
273+
acc_qk = (acc_qk < dot_product_min || acc_qk > dot_product_max)
274+
? -cutlass::platform::numeric_limits<
275+
ElementAccumulator>::infinity()
276+
: acc_qk;
259277
}
260278
}
261279

262-
acc_qk *= attn_scale;
263-
acc_dov *= attn_scale;
264-
acc_doo *= attn_scale;
265-
266280
auto id = make_identity_tensor(make_shape(1, 1));
267281
auto frag = make_tensor<ElementAccumulator>(Shape<_1, _1>{});
268282
frag(0) = acc_qk;
@@ -374,21 +388,28 @@ void __global__ fna_bwd_reference_dV_kernel(
374388
acc_qk += rQ * rK;
375389
} // for idx_D0
376390

377-
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
378-
// SCALING.
391+
acc_qk *= attn_scale;
392+
393+
// (Optional) clip dot products (mask off out of bound dot products)
379394
if (has_dot_product_min || has_dot_product_max) {
380395
if (not has_dot_product_max) {
381-
acc_qk = cutlass::fast_max(acc_qk, dot_product_min);
396+
acc_qk = acc_qk < dot_product_min
397+
? -cutlass::platform::numeric_limits<
398+
ElementAccumulator>::infinity()
399+
: acc_qk;
382400
} else if (not has_dot_product_min) {
383-
acc_qk = cutlass::fast_min(acc_qk, dot_product_max);
401+
acc_qk = acc_qk > dot_product_max
402+
? -cutlass::platform::numeric_limits<
403+
ElementAccumulator>::infinity()
404+
: acc_qk;
384405
} else {
385-
acc_qk = cutlass::fast_max(
386-
cutlass::fast_min(acc_qk, dot_product_max), dot_product_min);
406+
acc_qk = (acc_qk < dot_product_min || acc_qk > dot_product_max)
407+
? -cutlass::platform::numeric_limits<
408+
ElementAccumulator>::infinity()
409+
: acc_qk;
387410
}
388411
}
389412

390-
acc_qk *= attn_scale;
391-
392413
auto id = make_identity_tensor(make_shape(1, 1));
393414
auto frag = make_tensor<ElementAccumulator>(Shape<_1, _1>{});
394415
frag(0) = acc_qk;

csrc/include/natten/cuda/reference/fna_reference_forward.hpp

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -138,20 +138,26 @@ void __global__ fna_reference_kernel(
138138
acc += eQ * eK;
139139
}
140140

141-
// (Optional) clip dot products -- MUST BE DONE PRIOR TO MASKING &
142-
// SCALING.
141+
acc = acc * attn_scale;
142+
143+
// (Optional) clip dot products (mask off out of bound dot products)
143144
if (has_dot_product_min || has_dot_product_max) {
144145
if (not has_dot_product_max) {
145-
acc = cutlass::fast_max(acc, dot_product_min);
146+
acc = acc < dot_product_min ? -cutlass::platform::numeric_limits<
147+
ElementAccumulator>::infinity()
148+
: acc;
146149
} else if (not has_dot_product_min) {
147-
acc = cutlass::fast_min(acc, dot_product_max);
150+
acc = acc > dot_product_max ? -cutlass::platform::numeric_limits<
151+
ElementAccumulator>::infinity()
152+
: acc;
148153
} else {
149-
acc = cutlass::fast_max(
150-
cutlass::fast_min(acc, dot_product_max), dot_product_min);
154+
acc = (acc < dot_product_min || acc > dot_product_max)
155+
? -cutlass::platform::numeric_limits<
156+
ElementAccumulator>::infinity()
157+
: acc;
151158
}
152159
}
153160

154-
acc = acc * attn_scale;
155161
auto frag = make_tensor<ElementAccumulator>(Shape<_1, _1>{});
156162
frag(0) = acc;
157163
attention_mask.apply_mask(
@@ -212,17 +218,17 @@ void __global__ fna_reference_kernel(
212218
__syncthreads();
213219
}
214220

221+
ElementAccumulator scale = sum == 0.0f ? 0.0f : 1.0f / sum;
215222
for (int i = 0; i < DimPerThread; ++i) {
216223
int idx_D = threadIdx.x + i * blockDim.x;
217224
if (idx_D < size<1>(mO)) {
218-
ElementAccumulator scale = 1.0f / sum;
219225
mO(idx_Q + offset_Q, idx_D, idx_L) =
220226
static_cast<typename TensorO::value_type>(final_acc[i] * scale);
221227
}
222228
}
223229

224230
if (threadIdx.x == 0) {
225-
mLSE(idx_Q + offset_Q, idx_L) = log(sum) + maxS;
231+
mLSE(idx_Q + offset_Q, idx_L) = sum == 0.0f ? 0.0f : (log(sum) + maxS);
226232
}
227233
}
228234
}

0 commit comments

Comments
 (0)