Skip to content

Commit 48d73cd

Browse files
authored
neural: Fix a sync issue (#10281)
Closes #10272. The root cause of this issue is a race condition. Add two syncs to avoid it: 1. At the beginning of the mma loop -> This prevents a fast warp from starting to write shared memory A in iteration i+1 while other, slower warps are still reading shared memory A. 2. In the backward pass, after mma and before outerProductAccumulate -> because mma ends with only a warp-level sync, outerProductAccumulate could start executing on some fast warps while the slow warps are still running mma. This PR also fixes a numerical issue in the activation functions: every function using exp() was not numerically stable, so they are updated here.
1 parent d76973a commit 48d73cd

File tree

2 files changed

+24
-3
lines changed

2 files changed

+24
-3
lines changed

source/standard-modules/neural/accelerate-vector-coopmat.slang

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,6 +1044,11 @@ VISIBILITY_LEVEL struct MMAHelper<T, int InputSize, int OutputSize, int Subgroup
10441044
}
10451045
else
10461046
{
1047+
// Ensure all warps finished reading shared memory from the previous
1048+
// tile iteration (or from the previous layer's output writeback in
1049+
// fused multi-layer kernels) before overwriting with new tile data.
1050+
GroupMemoryBarrierWithGroupSync();
1051+
10471052
loadShA<U, Address>(ptrAOffset[0], tileIndex, weightAddress);
10481053
loadVectorToShB(ptrBOffset[0], tileIndex, subgroupIndex, inputVector);
10491054

@@ -1271,6 +1276,13 @@ public struct WaveTangledVector<T, ShMemSize : ISharedMemorySize, int N, int Sub
12711276
// outerProductAccumulate uses per-warp shared memory for both A (dOutput vectors)
12721277
// and B (input vectors). The B region must start after ALL per-warp A regions to
12731278
// avoid overlapping writes between warps (warp i's A would alias warp (i-1)'s B).
1279+
//
1280+
// Sync required: mma() above ends with warp-level sync only, but
1281+
// outerProductAccumulate reuses the same shared memory pool (starting at shA=0).
1282+
// Without a group sync, fast warps starting the outer product would corrupt
1283+
// slow warps still reading from mma's output writeback.
1284+
GroupMemoryBarrierWithGroupSync();
1285+
12741286
static const int _outerRows = (MMA.M + MMA.CMShape.ROW_A - 1) / MMA.CMShape.ROW_A;
12751287
static const int _outerPerWarpA = _outerRows * MMA.CMShape.ROW_A / MMA.CMShape.ElementCountPerVector * MMA.CMShape.COLUMN_A;
12761288
uint shB_outer = shA + uint(getWaveCount() * _outerPerWarpA);
@@ -1284,6 +1296,8 @@ public struct WaveTangledVector<T, ShMemSize : ISharedMemorySize, int N, int Sub
12841296
>( doutput, shA, shB, shC, dWeightAddress.p, none);
12851297
dthis = DifferentialPair<This>(dthis.p, dInput);
12861298
1299+
GroupMemoryBarrierWithGroupSync();
1300+
12871301
static const int _outerRows2 = (MMA.M + MMA.CMShape.ROW_A - 1) / MMA.CMShape.ROW_A;
12881302
static const int _outerPerWarpA2 = _outerRows2 * MMA.CMShape.ROW_A / MMA.CMShape.ElementCountPerVector * MMA.CMShape.COLUMN_A;
12891303
uint shB_outer2 = shA + uint(getWaveCount() * _outerPerWarpA2);

source/standard-modules/neural/activations.slang

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,10 @@ public struct Sigmoid<T> : IActivation<T>
9494
[ForceUnroll]
9595
for (int i = 0; i < Vector.Size; i++)
9696
{
97+
// Numerically stable: exp() argument is always <= 0
9798
let x = input[i];
98-
output[i] = T(1) / (T(1) + exp(-x));
99+
let ex = exp(x >= T(0) ? -x : x);
100+
output[i] = x >= T(0) ? T(1) / (T(1) + ex) : ex / (T(1) + ex);
99101
}
100102
return output;
101103
}
@@ -185,8 +187,10 @@ public struct SiLU<T> : IActivation<T>
185187
[ForceUnroll]
186188
for (int i = 0; i < Vector.Size; i++)
187189
{
190+
// x * sigmoid(x), numerically stable: exp() argument is always <= 0
188191
let x = input[i];
189-
output[i] = x / (T(1) + exp(-x)); // x * sigmoid(x)
192+
let ex = exp(x >= T(0) ? -x : x);
193+
output[i] = x >= T(0) ? x / (T(1) + ex) : x * ex / (T(1) + ex);
190194
}
191195
return output;
192196
}
@@ -212,8 +216,11 @@ public struct QuickGELU<T> : IActivation<T>
212216
[ForceUnroll]
213217
for (int i = 0; i < Vector.Size; i++)
214218
{
219+
// x * sigmoid(1.702 * x), numerically stable: exp() argument is always <= 0
215220
let x = input[i];
216-
output[i] = x / (T(1) + exp(T(-1.702) * x)); // x * sigmoid(1.702 * x)
221+
let sx = T(1.702) * x;
222+
let ex = exp(sx >= T(0) ? -sx : sx);
223+
output[i] = sx >= T(0) ? x / (T(1) + ex) : x * ex / (T(1) + ex);
217224
}
218225
return output;
219226
}

0 commit comments

Comments
 (0)