|
| 1 | +// RUN: gc-opt %s --allocs-to-slm | FileCheck %s |
| 2 | + |
// Compute the per-thread offset into SLM: (Xthread_idx * Yblock_sz * Zblock_sz + Ythread_idx * Zblock_sz + Zthread_idx) * Xchunk_size
| 4 | +// CHECK: #map = affine_map<(d0, d1, d2) -> ((d0 * 12 + d1 * 4 + d2) * 256)> |
| 5 | + |
// Exercises the allocs-to-slm pass: a memref.alloc with no explicit memory
// space that appears inside a gpu.launch region is promoted to a single
// shared-local-memory (space 3) buffer sized for all threads, and each thread
// gets a per-thread subview at an affine-computed offset. Allocs outside the
// launch, or with an explicit memory space, are left untouched.
func.func @entry() {
  %c1 = arith.constant 1 : index
  %c2 = arith.constant 2 : index
  %c3 = arith.constant 3 : index
  %c4 = arith.constant 4 : index

  // Memory space wasn't assigned as it's allocated outside of gpu.launch block
  // CHECK: %[[NEW_MEMREF_0:.*]] = memref.alloc() : memref<256xf16>
  %0 = memref.alloc() : memref<256xf16>
  // Capture thread-id variables
  // CHECK: gpu.launch blocks(%[[ARG0:.+]], %[[ARG1:.+]], %[[ARG2:.+]]) in (%[[ARG6:.+]] = %c2, %[[ARG7:.+]] = %c2, %[[ARG8:.+]] = %c1) threads
  // CHECK-SAME: (%[[THREAD_X:.+]], %[[THREAD_Y:.+]], %[[THREAD_Z:.+]]) in
  // CHECK-SAME: (%[[ARG9:.+]] = %c2, %[[ARG10:.+]] = %c3, %[[ARG11:.+]] = %c4) {
  gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %c2, %sz_by = %c2, %sz_bz = %c1)
             threads(%tx, %ty, %tz) in (%sz_tx = %c2, %sz_ty = %c3, %sz_tz = %c4) {
    // Memory space was changed as it's explicitly specified
    // CHECK: %[[NEW_MEMREF_1:.*]] = memref.alloc() : memref<256xf16, 1>
    %1 = memref.alloc() : memref<256xf16, 1>
    // Added 'shared' memory space and allocated SLM for each thread (2 * 3 * 4 = 24; 24 * 256 = 6144)
    // CHECK: %[[NEW_MEMREF_2:.*]] = memref.alloc() : memref<6144xf16, 3>
    // CHECK: %[[OFF_X:.*]] = affine.apply #map(%[[THREAD_X]], %[[THREAD_Y]], %[[THREAD_Z]])
    // CHECK: %[[NEW_MEMREF_3:.*]] = memref.subview %[[NEW_MEMREF_2]][%[[OFF_X]]] [256] [1]
    // CHECK-SAME: memref<6144xf16, 3> to memref<256xf16, strided<[1], offset: ?>, 3>
    %2 = memref.alloc() : memref<256xf16>

    // CHECK: linalg.add ins(%[[NEW_MEMREF_1]], %[[NEW_MEMREF_3]] :
    // CHECK-SAME: memref<256xf16, 1>, memref<256xf16, strided<[1], offset: ?>, 3>) outs(%[[NEW_MEMREF_0]] : memref<256xf16>)
    linalg.add ins(%1, %2 :memref<256xf16, 1>, memref<256xf16>) outs(%0 : memref<256xf16>)
    // CHECK: memref.dealloc %[[NEW_MEMREF_1]] : memref<256xf16, 1>
    // Verify that there are no deallocs for SLM. NOTE: FileCheck matches
    // patterns literally outside of '{{...}}', so a trailing ' .*' would be a
    // literal string that can never match, making the CHECK-NOTs vacuous —
    // the bare prefix below actually guards against a stray dealloc.
    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_2]]
    // CHECK-NOT: memref.dealloc %[[NEW_MEMREF_3]]
    memref.dealloc %1 : memref<256xf16, 1>
    memref.dealloc %2 : memref<256xf16>
    gpu.terminator
  }
  return
}
0 commit comments