|
1 | | -// RUN: triton-opt %s -split-input-file --convert-scf-to-cf --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=CF |
2 | | -// RUN: triton-opt %s -split-input-file --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=SCF |
3 | | -// RUN: triton-opt %s -split-input-file --convert-scf-to-cf --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=CF |
4 | | -// RUN: triton-opt %s -split-input-file --allocate-shared-memory -test-print-membar | FileCheck %s --check-prefix=CHECK --check-prefix=SCF |
| 1 | +// RUN: triton-opt %s -split-input-file --allocate-shared-memory -test-print-membar | FileCheck %s |
5 | 2 |
|
6 | 3 | #AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> |
7 | 4 | #sliceAd0 = #ttg.slice<{dim = 0, parent = #AL}> |
@@ -117,17 +114,6 @@ tt.func @async_wait(%arg: tensor<32x16xf16, #AL>) { |
117 | 114 | tt.return |
118 | 115 | } |
119 | 116 |
|
120 | | -// CHECK-LABEL: @async_store_wait |
121 | | -tt.func @async_store_wait(%arg: tensor<32x16xf16, #AL>) { |
122 | | - %alloc = ttg.local_alloc : () -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> |
123 | | - // CHECK: async_tma_store_wait |
124 | | - ttng.async_tma_store_wait {pendings = 0 : i32} |
125 | | - // CHECK-NEXT: gpu.barrier |
126 | | - // CHECK-NEXT: ttg.local_store |
127 | | - ttg.local_store %arg, %alloc : tensor<32x16xf16, #AL> -> !ttg.memdesc<32x16xf16, #A_SHARED, #ttg.shared_memory, mutable> |
128 | | - tt.return |
129 | | -} |
130 | | - |
131 | 117 | // CHECK-LABEL: subview |
132 | 118 | tt.func @subview() { |
133 | 119 | %cst0 = arith.constant dense<0.000000e+00> : tensor<32x16xf16, #AL> |
@@ -743,111 +729,6 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar |
743 | 729 |
|
744 | 730 | // ----- |
745 | 731 |
|
746 | | -#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> |
747 | | -#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> |
748 | | -#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> |
749 | | - |
750 | | -module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 18944 : i32} { |
751 | | -// CHECK-LABEL: tma_special_cases |
752 | | -tt.func @tma_special_cases(%arg1: !tt.ptr<i8, 0>) -> (tensor<256x64xf16, #blocked>){ |
753 | | - %true = arith.constant 1 : i1 |
754 | | - %cx = arith.constant dense<1> : tensor<32xi32> |
755 | | - %c0 = arith.constant 0 : i32 |
756 | | - %barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
757 | | - %alloc = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> |
758 | | - // CHECK: ttng.init_barrier |
759 | | - // CHECK-NEXT: ttng.init_barrier |
760 | | - ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
761 | | - ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
762 | | - |
763 | | - // CHECK-NEXT: gpu.barrier |
764 | | - // CHECK-NEXT: ttng.barrier_expect |
765 | | - // CHECK-NEXT: ttng.async_tma_copy_global_to_local |
766 | | - // CHECK-NEXT: ttng.wait_barrier |
767 | | - ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
768 | | - ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> |
769 | | - ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
770 | | - |
771 | | - // CHECK-NEXT: ttng.async_tma_copy_global_to_local |
772 | | - // CHECK-NEXT: ttng.barrier_expect |
773 | | - // CHECK-NEXT: gpu.barrier |
774 | | - // CHECK-NEXT: ttng.wait_barrier |
775 | | - ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> |
776 | | - ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
777 | | - ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
778 | | - |
779 | | - // CHECK-NEXT: ttg.local_load |
780 | | - %t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked> |
781 | | - |
782 | | - // CHECK-NEXT: ttng.barrier_expect |
783 | | - // CHECK-NEXT: gpu.barrier |
784 | | - // CHECK-NEXT: ttng.async_tma_copy_global_to_local |
785 | | - // CHECK-NEXT: ttng.wait_barrier |
786 | | - ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
787 | | - ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> |
788 | | - ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
789 | | - |
790 | | - // CHECK-NEXT: memdesc_subview |
791 | | - // CHECK-NEXT: ttng.barrier_expect |
792 | | - // CHECK-NEXT: ttng.async_tma_gather |
793 | | - // CHECK-NEXT: gpu.barrier |
794 | | - // CHECK-NEXT: ttng.wait_barrier |
795 | | - %view = ttg.memdesc_subview %alloc[%c0, %c0] : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable> |
796 | | - ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
797 | | - ttng.async_tma_gather %arg1[%cx, %c0] %view, %barrier, %true : !tt.ptr<i8, 0>, tensor<32xi32>, i32, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable>, !ttg.memdesc<32x64xf16, #shared, #ttg.shared_memory, mutable>, i1 |
798 | | - ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
799 | | - |
800 | | - // CHECK-NEXT: gpu.barrier |
801 | | - // CHECK-NEXT: ttng.inval_barrier |
802 | | - // CHECK-NEXT: ttng.inval_barrier |
803 | | - ttng.inval_barrier %barrier : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
804 | | - ttng.inval_barrier %barrier : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
805 | | - |
806 | | - tt.return %t : tensor<256x64xf16, #blocked> |
807 | | -} |
808 | | -} |
809 | | - |
810 | | -// ----- |
811 | | - |
812 | | -#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> |
813 | | -#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> |
814 | | -#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> |
815 | | - |
816 | | -module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 18944 : i32} { |
817 | | -// CHECK-LABEL: tma_special_cases_cf |
818 | | -tt.func @tma_special_cases_cf(%arg1: !tt.ptr<i8, 0>, %i1 : i1, %arg2: tensor<256x64xf16, #blocked>) -> (tensor<256x64xf16, #blocked>){ |
819 | | - %true = arith.constant 1 : i1 |
820 | | - %c0 = arith.constant 0 : i32 |
821 | | - %barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
822 | | - %alloc = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> |
823 | | - // CF: cf.cond_br |
824 | | - // SCF: scf.if |
825 | | - scf.if %i1 { |
826 | | - // CHECK-NOT: gpu.barrier |
827 | | - // CHECK: ttng.async_tma_copy_global_to_local |
828 | | - // CHECK-NEXT: ttng.barrier_expect |
829 | | - // CHECK-NEXT: ttng.wait_barrier |
830 | | - // CF-NEXT: cf.br |
831 | | - // SCF-NEXT: } else { |
832 | | - ttng.async_tma_copy_global_to_local %arg1[%c0, %c0] %alloc, %barrier, %true : !tt.ptr<i8, 0>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> |
833 | | - ttng.barrier_expect %barrier, 49152, %true : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
834 | | - ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory, mutable> |
835 | | - } else { |
836 | | - // CHECK-NOT: gpu.barrier |
837 | | - // CHECK: ttg.local_store |
838 | | - // CF-NEXT: cf.br |
839 | | - // SCF-NEXT: } |
840 | | - ttg.local_store %arg2, %alloc : tensor<256x64xf16, #blocked> -> !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> |
841 | | - } |
842 | | - // CHECK: gpu.barrier |
843 | | - // CHECK-NEXT: ttg.local_load |
844 | | - %t = ttg.local_load %alloc : !ttg.memdesc<256x64xf16, #shared, #ttg.shared_memory, mutable> -> tensor<256x64xf16, #blocked> |
845 | | - tt.return %t : tensor<256x64xf16, #blocked> |
846 | | -} |
847 | | -} |
848 | | - |
849 | | -// ----- |
850 | | - |
851 | 732 | #layout = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> |
852 | 733 | #smem = #ttg.shared_memory |
853 | 734 |
|
@@ -983,26 +864,6 @@ tt.func @direct_backedge_within_loop(%arg0: index, %arg1: index, %arg2: index, % |
983 | 864 |
|
984 | 865 | // ----- |
985 | 866 |
|
986 | | -// CHECK-LABEL: tmem_copy_after_alloc |
987 | | -#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}> |
988 | | -#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> |
989 | | -#smem = #ttg.shared_memory |
990 | | -#tmem_scales = #ttng.tensor_memory_scales_encoding<> |
991 | | -module attributes {"ttg.num-warps" = 4 : i32} { |
992 | | - tt.func @tmem_copy_after_alloc(%arg0: tensor<1x2048xf8E4M3FN, #blocked>) { |
993 | | - // CHECK: local_alloc |
994 | | - %0 = ttg.local_alloc %arg0 {allocation.offset = 53248 : i32} : (tensor<1x2048xf8E4M3FN, #blocked>) -> !ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem> |
995 | | - // CHECK: tmem_alloc |
996 | | - %1 = ttng.tmem_alloc {tensor_memory_col_offset = 256 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable> |
997 | | - // gpu.barrier |
998 | | - // CHECK: tmem_copy |
999 | | - ttng.tmem_copy %0, %1, : (!ttg.memdesc<1x2048xf8E4M3FN, #shared, #smem>, !ttg.memdesc<128x16xf8E4M3FN, #tmem_scales, #ttng.tensor_memory, mutable>) -> () |
1000 | | - tt.return |
1001 | | - } |
1002 | | -} |
1003 | | - |
1004 | | -// ----- |
1005 | | - |
1006 | 867 | #shared = #ttg.swizzled_shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}> |
1007 | 868 |
|
1008 | 869 | module attributes {"ttg.num-warps" = 4 : i32, "ttg.target" = "cuda:80"} { |
|
0 commit comments