[BACKEND] Don't allocate shmem for warps with repeated data in tt.scan (#5910)

lezcano · web-flow · commit de650ad7ef9f · 2025-02-13T10:03:57.000-08:00
It turns out that the earlier changes to reduce for LL support had already
trimmed reduce's shmem use to the right size, so only tt.scan needed fixing.
diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
@@ -267,7 +267,7 @@ bool ScanLoweringHelper::isSupported() {
 }
 
 unsigned ScanLoweringHelper::getScratchSizeInElems() {
-  unsigned numWarps = lookupNumWarps(scanOp);
+  unsigned numWarps = product(getEncoding().getWarpsPerCTA());
   unsigned numNonAxisElementsPerWarp =
       getNonAxisNumThreadsPerWarp() * getNonAxisNumElementsPerThread();
   unsigned numElements = numWarps * numNonAxisElementsPerWarp *
diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir
@@ -615,4 +615,14 @@ tt.func @call_graph_2(%A : !tt.ptr<f16>, %cond : i1) {
   // CHECK-NEXT: size = 1024
 }
 
+// CHECK-LABEL: scan_alloc
+tt.func @scan_alloc(%x : tensor<8x16xf32, #AL>) {
+  // CHECK: offset = 0, size = 128
+  %a = "tt.scan"(%x) <{axis = 0 : i32, reverse = false}>({
+  ^bb0(%arg0: f32, %arg1: f32):
+    %add = arith.addf %arg0, %arg1 : f32
+    tt.scan.return %add : f32
+  }) : (tensor<8x16xf32, #AL>) -> tensor<8x16xf32, #AL>
+  tt.return
+}
 }

Original file line number	Diff line number	Diff line change
`@@ -267,7 +267,7 @@ bool ScanLoweringHelper::isSupported() {`
`267`	`267`	`}`
`268`	`268`
`269`	`269`	`unsigned ScanLoweringHelper::getScratchSizeInElems() {`
`270`		`- unsigned numWarps = lookupNumWarps(scanOp);`
	`270`	`+ unsigned numWarps = product(getEncoding().getWarpsPerCTA());`
`271`	`271`	`unsigned numNonAxisElementsPerWarp =`
`272`	`272`	`getNonAxisNumThreadsPerWarp() * getNonAxisNumElementsPerThread();`
`273`	`273`	`unsigned numElements = numWarps * numNonAxisElementsPerWarp *`
Original file line number	Diff line number	Diff line change
`@@ -615,4 +615,14 @@ tt.func @call_graph_2(%A : !tt.ptr<f16>, %cond : i1) {`
`615`	`615`	`// CHECK-NEXT: size = 1024`
`616`	`616`	`}`
`617`	`617`
	`618`	`+// CHECK-LABEL: scan_alloc`
	`619`	`+tt.func @scan_alloc(%x : tensor<8x16xf32, #AL>) {`
	`620`	`+ // CHECK: offset = 0, size = 128`
	`621`	`+ %a = "tt.scan"(%x) <{axis = 0 : i32, reverse = false}>({`
	`622`	`+ ^bb0(%arg0: f32, %arg1: f32):`
	`623`	`+ %add = arith.addf %arg0, %arg1 : f32`
	`624`	`+ tt.scan.return %add : f32`
	`625`	`+ }) : (tensor<8x16xf32, #AL>) -> tensor<8x16xf32, #AL>`
	`626`	`+ tt.return`
	`627`	`+}`
`618`	`628`	`}`