@@ -735,13 +735,18 @@ gpu.module @test_module {
735735 }
736736
737737 // CHECK-LABEL: func @gpu_shuffle_promote()
738- func.func @gpu_shuffle_promote () -> (f32 , f32 , f32 ) {
738+ func.func @gpu_shuffle_promote () -> (f32 , f32 , f32 , f32 , f32 ) {
739+ // CHECK: %[[#POISON:]] = llvm.mlir.poison : f32
740+ // CHECK: %[[#NEGWIDTH:]] = llvm.mlir.constant(-64 : i32) : i32
739741 // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
740742 %arg0 = arith.constant 1.0 : f32
741743 %arg1 = arith.constant 4 : i32
742744 %arg2 = arith.constant 16 : i32
743745 %arg3 = arith.constant 32 : i32
746+ // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(64 : i32) : i32
744747 %arg4 = arith.constant 64 : i32
748+ // CHECK: %[[#C1:]] = llvm.mlir.constant(1 : i32) : i32
749+ %arg5 = arith.constant 1 : i32
745750 // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
746751 // CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
747752 // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
@@ -763,7 +768,84 @@ gpu.module @test_module {
763768 // CHECK: %[[#SEL:]] = llvm.select %[[#CMP]], %[[#EXTRACT1]], %[[#EXTRACT0]] : i1, i32
764769 // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#SEL]] : i32 to f32
765770 %shfl3 , %pred3 = gpu.shuffle xor %arg0 , %arg3 , %arg4 : f32
766- func.return %shfl1 , %shfl2 , %shfl3 : f32 , f32 , f32
771+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
772+ // CHECK: %[[#SUB:]] = llvm.sub %[[#LANE_ID]], %[[#C1]] : i32
773+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
774+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
775+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#SUB]], %[[#AND]] : i32
776+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 312, 15, 15, false : f32
777+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
778+ %shflu , %predu = gpu.shuffle up %arg0 , %arg5 , %arg4 : f32
779+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
780+ // CHECK: %[[#OP:]] = llvm.add %[[#LANE_ID]], %[[#C1]] : i32
781+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
782+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
783+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#OP]], %[[#AND]] : i32
784+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 304, 15, 15, false : f32
785+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
786+ %shfld , %predd = gpu.shuffle down %arg0 , %arg5 , %arg4 : f32
787+ func.return %shfl1 , %shfl2 , %shfl3 , %shflu , %shfld : f32 , f32 , f32 , f32 , f32
788+ }
789+
790+ // CHECK-LABEL: func @gpu_butterfly_shuffle()
791+ func.func @gpu_butterfly_shuffle () -> (f32 , f32 , f32 , f32 , f32 , f32 ) {
792+ // CHECK: %[[#POISON:]] = llvm.mlir.poison : f32
793+ // CHECK: %[[#NEGWIDTH:]] = llvm.mlir.constant(-64 : i32) : i32
794+ // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
795+ %arg0 = arith.constant 1.0 : f32
796+ // CHECK: %[[#C1:]] = llvm.mlir.constant(1 : i32) : i32
797+ %c1 = arith.constant 1 : i32
798+ // CHECK: %[[#C2:]] = llvm.mlir.constant(2 : i32) : i32
799+ %c2 = arith.constant 2 : i32
800+ %c4 = arith.constant 4 : i32
801+ %c8 = arith.constant 8 : i32
802+ %c16 = arith.constant 16 : i32
803+ %c32 = arith.constant 32 : i32
804+ // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(64 : i32) : i32
805+ %c64 = arith.constant 64 : i32
806+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
807+ // CHECK: %[[#XOR:]] = llvm.xor %[[#LANE_ID]], %[[#C1]] : i32
808+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
809+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
810+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#XOR]], %[[#AND]] : i32
811+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 177, 15, 15, false : f32
812+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
813+ %shfl1 , %pred1 = gpu.shuffle xor %arg0 , %c1 , %c64 : f32
814+ // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi
815+ // CHECK: %[[#XOR:]] = llvm.xor %[[#LANE_ID]], %[[#C2]] : i32
816+ // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32
817+ // CHECK: %[[#AND:]] = llvm.and %[[#ADD]], %[[#NEGWIDTH]] : i32
818+ // CHECK: %[[#VALID:]] = llvm.icmp "slt" %[[#XOR]], %[[#AND]] : i32
819+ // CHECK: %[[#PERMUTE:]] = rocdl.update.dpp %[[#VALUE]], %[[#VALUE]] with 78, 15, 15, false : f32
820+ // CHECK: %[[#SELECT:]] = llvm.select %[[#VALID]], %[[#PERMUTE]], %[[#POISON]] : i1, f32
821+ %shfl2 , %pred2 = gpu.shuffle xor %arg0 , %c2 , %c64 : f32
822+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
823+ // CHECK: %[[#MASK:]] = llvm.mlir.constant(4127 : i32) : i32
824+ // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
825+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
826+ %shfl3 , %pred3 = gpu.shuffle xor %arg0 , %c4 , %c64 : f32
827+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
828+ // CHECK: %[[#MASK:]] = llvm.mlir.constant(8223 : i32) : i32
829+ // CHECK: %[[#PERMUTE:]] = rocdl.ds_swizzle %[[#CAST_VALUE]], %[[#MASK]] : (i32, i32) -> i32
830+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32
831+ %shfl4 , %pred4 = gpu.shuffle xor %arg0 , %c8 , %c64 : f32
832+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
833+ // CHECK: %[[#PERMUTE:]] = rocdl.permlane16.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
834+ // CHECK: %[[#EXTRACT0:]] = llvm.extractvalue %[[#PERMUTE]][0] : !llvm.struct<(i32, i32)>
835+ // CHECK: %[[#EXTRACT1:]] = llvm.extractvalue %[[#PERMUTE]][1] : !llvm.struct<(i32, i32)>
836+ // CHECK: %[[#CMP:]] = llvm.icmp "eq" %[[#EXTRACT0]], %[[#CAST_VALUE]] : i32
837+ // CHECK: %[[#SEL:]] = llvm.select %[[#CMP]], %[[#EXTRACT1]], %[[#EXTRACT0]] : i1, i32
838+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#SEL]] : i32 to f32
839+ %shfl5 , %pred5 = gpu.shuffle xor %arg0 , %c16 , %c64 : f32
840+ // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32
841+ // CHECK: %[[#PERMUTE:]] = rocdl.permlane32.swap %[[#CAST_VALUE]], %[[#CAST_VALUE]], false, false : (i32, i32) -> <(i32, i32)>
842+ // CHECK: %[[#EXTRACT0:]] = llvm.extractvalue %[[#PERMUTE]][0] : !llvm.struct<(i32, i32)>
843+ // CHECK: %[[#EXTRACT1:]] = llvm.extractvalue %[[#PERMUTE]][1] : !llvm.struct<(i32, i32)>
844+ // CHECK: %[[#CMP:]] = llvm.icmp "eq" %[[#EXTRACT0]], %[[#CAST_VALUE]] : i32
845+ // CHECK: %[[#SEL:]] = llvm.select %[[#CMP]], %[[#EXTRACT1]], %[[#EXTRACT0]] : i1, i32
846+ // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#SEL]] : i32 to f32
847+ %shfl6 , %pred6 = gpu.shuffle xor %arg0 , %c32 , %c64 : f32
848+ func.return %shfl1 , %shfl2 , %shfl3 , %shfl4 , %shfl5 , %shfl6 : f32 , f32 , f32 , f32 , f32 , f32
767849 }
768850
769851 // CHECK-LABEL: func @gpu_shuffle_vec
0 commit comments