diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index ea2c80eaf9583..493c0cfcab60c 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -179,157 +179,158 @@ def v128f16 : VTVec<128, f16, 96>; // 128 x f16 vector value
 def v256f16 : VTVec<256, f16, 97>; // 256 x f16 vector value
 def v512f16 : VTVec<512, f16, 98>; // 512 x f16 vector value
 
-def v2bf16 : VTVec<2, bf16, 99>; // 2 x bf16 vector value
-def v3bf16 : VTVec<3, bf16, 100>; // 3 x bf16 vector value
-def v4bf16 : VTVec<4, bf16, 101>; // 4 x bf16 vector value
-def v8bf16 : VTVec<8, bf16, 102>; // 8 x bf16 vector value
-def v16bf16 : VTVec<16, bf16, 103>; // 16 x bf16 vector value
-def v32bf16 : VTVec<32, bf16, 104>; // 32 x bf16 vector value
-def v64bf16 : VTVec<64, bf16, 105>; // 64 x bf16 vector value
-def v128bf16 : VTVec<128, bf16, 106>; // 128 x bf16 vector value
-
-def v1f32 : VTVec<1, f32, 107>; // 1 x f32 vector value
-def v2f32 : VTVec<2, f32, 108>; // 2 x f32 vector value
-def v3f32 : VTVec<3, f32, 109>; // 3 x f32 vector value
-def v4f32 : VTVec<4, f32, 110>; // 4 x f32 vector value
-def v5f32 : VTVec<5, f32, 111>; // 5 x f32 vector value
-def v6f32 : VTVec<6, f32, 112>; // 6 x f32 vector value
-def v7f32 : VTVec<7, f32, 113>; // 7 x f32 vector value
-def v8f32 : VTVec<8, f32, 114>; // 8 x f32 vector value
-def v9f32 : VTVec<9, f32, 115>; // 9 x f32 vector value
-def v10f32 : VTVec<10, f32, 116>; // 10 x f32 vector value
-def v11f32 : VTVec<11, f32, 117>; // 11 x f32 vector value
-def v12f32 : VTVec<12, f32, 118>; // 12 x f32 vector value
-def v16f32 : VTVec<16, f32, 119>; // 16 x f32 vector value
-def v32f32 : VTVec<32, f32, 120>; // 32 x f32 vector value
-def v64f32 : VTVec<64, f32, 121>; // 64 x f32 vector value
-def v128f32 : VTVec<128, f32, 122>; // 128 x f32 vector value
-def v256f32 : VTVec<256, f32, 123>; // 256 x f32 vector value
-def v512f32 : VTVec<512, f32, 124>; // 512 x f32 vector value
-def v1024f32 : VTVec<1024, f32, 125>; // 1024 x f32 vector value
-def v2048f32 : VTVec<2048, f32, 126>; // 2048 x f32 vector value
-
-def v1f64 : VTVec<1, f64, 127>; // 1 x f64 vector value
-def v2f64 : VTVec<2, f64, 128>; // 2 x f64 vector value
-def v3f64 : VTVec<3, f64, 129>; // 3 x f64 vector value
-def v4f64 : VTVec<4, f64, 130>; // 4 x f64 vector value
-def v8f64 : VTVec<8, f64, 131>; // 8 x f64 vector value
-def v16f64 : VTVec<16, f64, 132>; // 16 x f64 vector value
-def v32f64 : VTVec<32, f64, 133>; // 32 x f64 vector value
-def v64f64 : VTVec<64, f64, 134>; // 64 x f64 vector value
-def v128f64 : VTVec<128, f64, 135>; // 128 x f64 vector value
-def v256f64 : VTVec<256, f64, 136>; // 256 x f64 vector value
-
-def nxv1i1 : VTScalableVec<1, i1, 137>; // n x 1 x i1 vector value
-def nxv2i1 : VTScalableVec<2, i1, 138>; // n x 2 x i1 vector value
-def nxv4i1 : VTScalableVec<4, i1, 139>; // n x 4 x i1 vector value
-def nxv8i1 : VTScalableVec<8, i1, 140>; // n x 8 x i1 vector value
-def nxv16i1 : VTScalableVec<16, i1, 141>; // n x 16 x i1 vector value
-def nxv32i1 : VTScalableVec<32, i1, 142>; // n x 32 x i1 vector value
-def nxv64i1 : VTScalableVec<64, i1, 143>; // n x 64 x i1 vector value
-
-def nxv1i8 : VTScalableVec<1, i8, 144>; // n x 1 x i8 vector value
-def nxv2i8 : VTScalableVec<2, i8, 145>; // n x 2 x i8 vector value
-def nxv4i8 : VTScalableVec<4, i8, 146>; // n x 4 x i8 vector value
-def nxv8i8 : VTScalableVec<8, i8, 147>; // n x 8 x i8 vector value
-def nxv16i8 : VTScalableVec<16, i8, 148>; // n x 16 x i8 vector value
-def nxv32i8 : VTScalableVec<32, i8, 149>; // n x 32 x i8 vector value
-def nxv64i8 : VTScalableVec<64, i8, 150>; // n x 64 x i8 vector value
-
-def nxv1i16 : VTScalableVec<1, i16, 151>; // n x 1 x i16 vector value
-def nxv2i16 : VTScalableVec<2, i16, 152>; // n x 2 x i16 vector value
-def nxv4i16 : VTScalableVec<4, i16, 153>; // n x 4 x i16 vector value
-def nxv8i16 : VTScalableVec<8, i16, 154>; // n x 8 x i16 vector value
-def nxv16i16 : VTScalableVec<16, i16, 155>; // n x 16 x i16 vector value
-def nxv32i16 : VTScalableVec<32, i16, 156>; // n x 32 x i16 vector value
-
-def nxv1i32 : VTScalableVec<1, i32, 157>; // n x 1 x i32 vector value
-def nxv2i32 : VTScalableVec<2, i32, 158>; // n x 2 x i32 vector value
-def nxv4i32 : VTScalableVec<4, i32, 159>; // n x 4 x i32 vector value
-def nxv8i32 : VTScalableVec<8, i32, 160>; // n x 8 x i32 vector value
-def nxv16i32 : VTScalableVec<16, i32, 161>; // n x 16 x i32 vector value
-def nxv32i32 : VTScalableVec<32, i32, 162>; // n x 32 x i32 vector value
-
-def nxv1i64 : VTScalableVec<1, i64, 163>; // n x 1 x i64 vector value
-def nxv2i64 : VTScalableVec<2, i64, 164>; // n x 2 x i64 vector value
-def nxv4i64 : VTScalableVec<4, i64, 165>; // n x 4 x i64 vector value
-def nxv8i64 : VTScalableVec<8, i64, 166>; // n x 8 x i64 vector value
-def nxv16i64 : VTScalableVec<16, i64, 167>; // n x 16 x i64 vector value
-def nxv32i64 : VTScalableVec<32, i64, 168>; // n x 32 x i64 vector value
-
-def nxv1f16 : VTScalableVec<1, f16, 169>; // n x 1 x f16 vector value
-def nxv2f16 : VTScalableVec<2, f16, 170>; // n x 2 x f16 vector value
-def nxv4f16 : VTScalableVec<4, f16, 171>; // n x 4 x f16 vector value
-def nxv8f16 : VTScalableVec<8, f16, 172>; // n x 8 x f16 vector value
-def nxv16f16 : VTScalableVec<16, f16, 173>; // n x 16 x f16 vector value
-def nxv32f16 : VTScalableVec<32, f16, 174>; // n x 32 x f16 vector value
-
-def nxv1bf16 : VTScalableVec<1, bf16, 175>; // n x 1 x bf16 vector value
-def nxv2bf16 : VTScalableVec<2, bf16, 176>; // n x 2 x bf16 vector value
-def nxv4bf16 : VTScalableVec<4, bf16, 177>; // n x 4 x bf16 vector value
-def nxv8bf16 : VTScalableVec<8, bf16, 178>; // n x 8 x bf16 vector value
-def nxv16bf16 : VTScalableVec<16, bf16, 179>; // n x 16 x bf16 vector value
-def nxv32bf16 : VTScalableVec<32, bf16, 180>; // n x 32 x bf16 vector value
-
-def nxv1f32 : VTScalableVec<1, f32, 181>; // n x 1 x f32 vector value
-def nxv2f32 : VTScalableVec<2, f32, 182>; // n x 2 x f32 vector value
-def nxv4f32 : VTScalableVec<4, f32, 183>; // n x 4 x f32 vector value
-def nxv8f32 : VTScalableVec<8, f32, 184>; // n x 8 x f32 vector value
-def nxv16f32 : VTScalableVec<16, f32, 185>; // n x 16 x f32 vector value
-
-def nxv1f64 : VTScalableVec<1, f64, 186>; // n x 1 x f64 vector value
-def nxv2f64 : VTScalableVec<2, f64, 187>; // n x 2 x f64 vector value
-def nxv4f64 : VTScalableVec<4, f64, 188>; // n x 4 x f64 vector value
-def nxv8f64 : VTScalableVec<8, f64, 189>; // n x 8 x f64 vector value
+def v1bf16 : VTVec<1, bf16, 99>; // 1 x bf16 vector value
+def v2bf16 : VTVec<2, bf16, 100>; // 2 x bf16 vector value
+def v3bf16 : VTVec<3, bf16, 101>; // 3 x bf16 vector value
+def v4bf16 : VTVec<4, bf16, 102>; // 4 x bf16 vector value
+def v8bf16 : VTVec<8, bf16, 103>; // 8 x bf16 vector value
+def v16bf16 : VTVec<16, bf16, 104>; // 16 x bf16 vector value
+def v32bf16 : VTVec<32, bf16, 105>; // 32 x bf16 vector value
+def v64bf16 : VTVec<64, bf16, 106>; // 64 x bf16 vector value
+def v128bf16 : VTVec<128, bf16, 107>; // 128 x bf16 vector value
+
+def v1f32 : VTVec<1, f32, 108>; // 1 x f32 vector value
+def v2f32 : VTVec<2, f32, 109>; // 2 x f32 vector value
+def v3f32 : VTVec<3, f32, 110>; // 3 x f32 vector value
+def v4f32 : VTVec<4, f32, 111>; // 4 x f32 vector value
+def v5f32 : VTVec<5, f32, 112>; // 5 x f32 vector value
+def v6f32 : VTVec<6, f32, 113>; // 6 x f32 vector value
+def v7f32 : VTVec<7, f32, 114>; // 7 x f32 vector value
+def v8f32 : VTVec<8, f32, 115>; // 8 x f32 vector value
+def v9f32 : VTVec<9, f32, 116>; // 9 x f32 vector value
+def v10f32 : VTVec<10, f32, 117>; // 10 x f32 vector value
+def v11f32 : VTVec<11, f32, 118>; // 11 x f32 vector value
+def v12f32 : VTVec<12, f32, 119>; // 12 x f32 vector value
+def v16f32 : VTVec<16, f32, 120>; // 16 x f32 vector value
+def v32f32 : VTVec<32, f32, 121>; // 32 x f32 vector value
+def v64f32 : VTVec<64, f32, 122>; // 64 x f32 vector value
+def v128f32 : VTVec<128, f32, 123>; // 128 x f32 vector value
+def v256f32 : VTVec<256, f32, 124>; // 256 x f32 vector value
+def v512f32 : VTVec<512, f32, 125>; // 512 x f32 vector value
+def v1024f32 : VTVec<1024, f32, 126>; // 1024 x f32 vector value
+def v2048f32 : VTVec<2048, f32, 127>; // 2048 x f32 vector value
+
+def v1f64 : VTVec<1, f64, 128>; // 1 x f64 vector value
+def v2f64 : VTVec<2, f64, 129>; // 2 x f64 vector value
+def v3f64 : VTVec<3, f64, 130>; // 3 x f64 vector value
+def v4f64 : VTVec<4, f64, 131>; // 4 x f64 vector value
+def v8f64 : VTVec<8, f64, 132>; // 8 x f64 vector value
+def v16f64 : VTVec<16, f64, 133>; // 16 x f64 vector value
+def v32f64 : VTVec<32, f64, 134>; // 32 x f64 vector value
+def v64f64 : VTVec<64, f64, 135>; // 64 x f64 vector value
+def v128f64 : VTVec<128, f64, 136>; // 128 x f64 vector value
+def v256f64 : VTVec<256, f64, 137>; // 256 x f64 vector value
+
+def nxv1i1 : VTScalableVec<1, i1, 138>; // n x 1 x i1 vector value
+def nxv2i1 : VTScalableVec<2, i1, 139>; // n x 2 x i1 vector value
+def nxv4i1 : VTScalableVec<4, i1, 140>; // n x 4 x i1 vector value
+def nxv8i1 : VTScalableVec<8, i1, 141>; // n x 8 x i1 vector value
+def nxv16i1 : VTScalableVec<16, i1, 142>; // n x 16 x i1 vector value
+def nxv32i1 : VTScalableVec<32, i1, 143>; // n x 32 x i1 vector value
+def nxv64i1 : VTScalableVec<64, i1, 144>; // n x 64 x i1 vector value
+
+def nxv1i8 : VTScalableVec<1, i8, 145>; // n x 1 x i8 vector value
+def nxv2i8 : VTScalableVec<2, i8, 146>; // n x 2 x i8 vector value
+def nxv4i8 : VTScalableVec<4, i8, 147>; // n x 4 x i8 vector value
+def nxv8i8 : VTScalableVec<8, i8, 148>; // n x 8 x i8 vector value
+def nxv16i8 : VTScalableVec<16, i8, 149>; // n x 16 x i8 vector value
+def nxv32i8 : VTScalableVec<32, i8, 150>; // n x 32 x i8 vector value
+def nxv64i8 : VTScalableVec<64, i8, 151>; // n x 64 x i8 vector value
+
+def nxv1i16 : VTScalableVec<1, i16, 152>; // n x 1 x i16 vector value
+def nxv2i16 : VTScalableVec<2, i16, 153>; // n x 2 x i16 vector value
+def nxv4i16 : VTScalableVec<4, i16, 154>; // n x 4 x i16 vector value
+def nxv8i16 : VTScalableVec<8, i16, 155>; // n x 8 x i16 vector value
+def nxv16i16 : VTScalableVec<16, i16, 156>; // n x 16 x i16 vector value
+def nxv32i16 : VTScalableVec<32, i16, 157>; // n x 32 x i16 vector value
+
+def nxv1i32 : VTScalableVec<1, i32, 158>; // n x 1 x i32 vector value
+def nxv2i32 : VTScalableVec<2, i32, 159>; // n x 2 x i32 vector value
+def nxv4i32 : VTScalableVec<4, i32, 160>; // n x 4 x i32 vector value
+def nxv8i32 : VTScalableVec<8, i32, 161>; // n x 8 x i32 vector value
+def nxv16i32 : VTScalableVec<16, i32, 162>; // n x 16 x i32 vector value
+def nxv32i32 : VTScalableVec<32, i32, 163>; // n x 32 x i32 vector value
+
+def nxv1i64 : VTScalableVec<1, i64, 164>; // n x 1 x i64 vector value
+def nxv2i64 : VTScalableVec<2, i64, 165>; // n x 2 x i64 vector value
+def nxv4i64 : VTScalableVec<4, i64, 166>; // n x 4 x i64 vector value
+def nxv8i64 : VTScalableVec<8, i64, 167>; // n x 8 x i64 vector value
+def nxv16i64 : VTScalableVec<16, i64, 168>; // n x 16 x i64 vector value
+def nxv32i64 : VTScalableVec<32, i64, 169>; // n x 32 x i64 vector value
+
+def nxv1f16 : VTScalableVec<1, f16, 170>; // n x 1 x f16 vector value
+def nxv2f16 : VTScalableVec<2, f16, 171>; // n x 2 x f16 vector value
+def nxv4f16 : VTScalableVec<4, f16, 172>; // n x 4 x f16 vector value
+def nxv8f16 : VTScalableVec<8, f16, 173>; // n x 8 x f16 vector value
+def nxv16f16 : VTScalableVec<16, f16, 174>; // n x 16 x f16 vector value
+def nxv32f16 : VTScalableVec<32, f16, 175>; // n x 32 x f16 vector value
+
+def nxv1bf16 : VTScalableVec<1, bf16, 176>; // n x 1 x bf16 vector value
+def nxv2bf16 : VTScalableVec<2, bf16, 177>; // n x 2 x bf16 vector value
+def nxv4bf16 : VTScalableVec<4, bf16, 178>; // n x 4 x bf16 vector value
+def nxv8bf16 : VTScalableVec<8, bf16, 179>; // n x 8 x bf16 vector value
+def nxv16bf16 : VTScalableVec<16, bf16, 180>; // n x 16 x bf16 vector value
+def nxv32bf16 : VTScalableVec<32, bf16, 181>; // n x 32 x bf16 vector value
+
+def nxv1f32 : VTScalableVec<1, f32, 182>; // n x 1 x f32 vector value
+def nxv2f32 : VTScalableVec<2, f32, 183>; // n x 2 x f32 vector value
+def nxv4f32 : VTScalableVec<4, f32, 184>; // n x 4 x f32 vector value
+def nxv8f32 : VTScalableVec<8, f32, 185>; // n x 8 x f32 vector value
+def nxv16f32 : VTScalableVec<16, f32, 186>; // n x 16 x f32 vector value
+
+def nxv1f64 : VTScalableVec<1, f64, 187>; // n x 1 x f64 vector value
+def nxv2f64 : VTScalableVec<2, f64, 188>; // n x 2 x f64 vector value
+def nxv4f64 : VTScalableVec<4, f64, 189>; // n x 4 x f64 vector value
+def nxv8f64 : VTScalableVec<8, f64, 190>; // n x 8 x f64 vector value
 
 // Sz = NF * MinNumElts * 8(bits)
-def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 190>; // RISCV vector tuple(min_num_elts=1, nf=2)
-def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 191>; // RISCV vector tuple(min_num_elts=1, nf=3)
-def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 192>; // RISCV vector tuple(min_num_elts=1, nf=4)
-def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 193>; // RISCV vector tuple(min_num_elts=1, nf=5)
-def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 194>; // RISCV vector tuple(min_num_elts=1, nf=6)
-def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 195>; // RISCV vector tuple(min_num_elts=1, nf=7)
-def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 196>; // RISCV vector tuple(min_num_elts=1, nf=8)
-def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 197>; // RISCV vector tuple(min_num_elts=2, nf=2)
-def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 198>; // RISCV vector tuple(min_num_elts=2, nf=3)
-def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 199>; // RISCV vector tuple(min_num_elts=2, nf=4)
-def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 200>; // RISCV vector tuple(min_num_elts=2, nf=5)
-def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 201>; // RISCV vector tuple(min_num_elts=2, nf=6)
-def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 202>; // RISCV vector tuple(min_num_elts=2, nf=7)
-def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 203>; // RISCV vector tuple(min_num_elts=2, nf=8)
-def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 204>; // RISCV vector tuple(min_num_elts=4, nf=2)
-def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 205>; // RISCV vector tuple(min_num_elts=4, nf=3)
-def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 206>; // RISCV vector tuple(min_num_elts=4, nf=4)
-def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 207>; // RISCV vector tuple(min_num_elts=4, nf=5)
-def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 208>; // RISCV vector tuple(min_num_elts=4, nf=6)
-def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 209>; // RISCV vector tuple(min_num_elts=4, nf=7)
-def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 210>; // RISCV vector tuple(min_num_elts=4, nf=8)
-def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 211>; // RISCV vector tuple(min_num_elts=8, nf=2)
-def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 212>; // RISCV vector tuple(min_num_elts=8, nf=3)
-def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 213>; // RISCV vector tuple(min_num_elts=8, nf=4)
-def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 214>; // RISCV vector tuple(min_num_elts=8, nf=5)
-def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 215>; // RISCV vector tuple(min_num_elts=8, nf=6)
-def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 216>; // RISCV vector tuple(min_num_elts=8, nf=7)
-def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 217>; // RISCV vector tuple(min_num_elts=8, nf=8)
-def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 218>; // RISCV vector tuple(min_num_elts=16, nf=2)
-def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 219>; // RISCV vector tuple(min_num_elts=16, nf=3)
-def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 220>; // RISCV vector tuple(min_num_elts=16, nf=4)
-def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 221>; // RISCV vector tuple(min_num_elts=32, nf=2)
-
-def x86mmx : ValueType<64, 222>; // X86 MMX value
-def Glue : ValueType<0, 223>; // Pre-RA sched glue
-def isVoid : ValueType<0, 224>; // Produces no value
-def untyped : ValueType<8, 225> { // Produces an untyped value
+def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 191>; // RISCV vector tuple(min_num_elts=1, nf=2)
+def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 192>; // RISCV vector tuple(min_num_elts=1, nf=3)
+def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 193>; // RISCV vector tuple(min_num_elts=1, nf=4)
+def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 194>; // RISCV vector tuple(min_num_elts=1, nf=5)
+def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 195>; // RISCV vector tuple(min_num_elts=1, nf=6)
+def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 196>; // RISCV vector tuple(min_num_elts=1, nf=7)
+def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 197>; // RISCV vector tuple(min_num_elts=1, nf=8)
+def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 198>; // RISCV vector tuple(min_num_elts=2, nf=2)
+def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 199>; // RISCV vector tuple(min_num_elts=2, nf=3)
+def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 200>; // RISCV vector tuple(min_num_elts=2, nf=4)
+def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 201>; // RISCV vector tuple(min_num_elts=2, nf=5)
+def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 202>; // RISCV vector tuple(min_num_elts=2, nf=6)
+def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 203>; // RISCV vector tuple(min_num_elts=2, nf=7)
+def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 204>; // RISCV vector tuple(min_num_elts=2, nf=8)
+def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 205>; // RISCV vector tuple(min_num_elts=4, nf=2)
+def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 206>; // RISCV vector tuple(min_num_elts=4, nf=3)
+def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 207>; // RISCV vector tuple(min_num_elts=4, nf=4)
+def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 208>; // RISCV vector tuple(min_num_elts=4, nf=5)
+def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 209>; // RISCV vector tuple(min_num_elts=4, nf=6)
+def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 210>; // RISCV vector tuple(min_num_elts=4, nf=7)
+def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 211>; // RISCV vector tuple(min_num_elts=4, nf=8)
+def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 212>; // RISCV vector tuple(min_num_elts=8, nf=2)
+def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 213>; // RISCV vector tuple(min_num_elts=8, nf=3)
+def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 214>; // RISCV vector tuple(min_num_elts=8, nf=4)
+def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 215>; // RISCV vector tuple(min_num_elts=8, nf=5)
+def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 216>; // RISCV vector tuple(min_num_elts=8, nf=6)
+def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 217>; // RISCV vector tuple(min_num_elts=8, nf=7)
+def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 218>; // RISCV vector tuple(min_num_elts=8, nf=8)
+def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 219>; // RISCV vector tuple(min_num_elts=16, nf=2)
+def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 220>; // RISCV vector tuple(min_num_elts=16, nf=3)
+def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 221>; // RISCV vector tuple(min_num_elts=16, nf=4)
+def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 222>; // RISCV vector tuple(min_num_elts=32, nf=2)
+
+def x86mmx : ValueType<64, 223>; // X86 MMX value
+def Glue : ValueType<0, 224>; // Pre-RA sched glue
+def isVoid : ValueType<0, 225>; // Produces no value
+def untyped : ValueType<8, 226> { // Produces an untyped value
   let LLVMName = "Untyped";
 }
-def funcref : ValueType<0, 226>; // WebAssembly's funcref type
-def externref : ValueType<0, 227>; // WebAssembly's externref type
-def exnref : ValueType<0, 228>; // WebAssembly's exnref type
-def x86amx : ValueType<8192, 229>; // X86 AMX value
-def i64x8 : ValueType<512, 230>; // 8 Consecutive GPRs (AArch64)
+def funcref : ValueType<0, 227>; // WebAssembly's funcref type
+def externref : ValueType<0, 228>; // WebAssembly's externref type
+def exnref : ValueType<0, 229>; // WebAssembly's exnref type
+def x86amx : ValueType<8192, 230>; // X86 AMX value
+def i64x8 : ValueType<512, 231>; // 8 Consecutive GPRs (AArch64)
 def aarch64svcount
-    : ValueType<16, 231>; // AArch64 predicate-as-counter
-def spirvbuiltin : ValueType<0, 232>; // SPIR-V's builtin type
+    : ValueType<16, 232>; // AArch64 predicate-as-counter
+def spirvbuiltin : ValueType<0, 233>; // SPIR-V's builtin type
 
 let isNormalValueType = false in {
 def token : ValueType<0, 504>; // TokenTy
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index b3e66ccc705f8..35619db0b4990 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -521,7 +521,7 @@ define void @frem() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = frem bfloat undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = frem <1 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16 = frem <1 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2BF16 = frem <2 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> undef, undef
@@ -761,7 +761,7 @@ define void @fcopysign() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
@@ -889,7 +889,7 @@ define void @fma() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
index afb2b64464521..588d852d7f26e 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
@@ -1,7 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+
+define void @reduce_fadd_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_fadd_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_fadd_bfloat'
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+  %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+  %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+  %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+  %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+  %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+  %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+  %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+  ret void
+}
 
 define void @reduce_fadd_half() {
 ; FP-REDUCE-ZVFH-LABEL: 'reduce_fadd_half'
@@ -116,6 +150,40 @@ define void @reduce_fadd_double() {
   ret void
 }
 
+define void @reduce_oredered_fadd_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_oredered_fadd_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_oredered_fadd_bfloat'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+  %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+  %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+  %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+  %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+  %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+  %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+  %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+  ret void
+}
+
 define void @reduce_oredered_fadd_half() {
 ; FP-REDUCE-LABEL: 'reduce_oredered_fadd_half'
 ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
index da1336aa724c9..913ce40f133da 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
@@ -1,7 +1,41 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+
+define void @reduce_fmul_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_fmul_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_fmul_bfloat'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+  %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+  %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+  %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+  %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+  %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+  %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+  %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+  ret void
+}
 
 define void @reduce_fmul_half() {
 ; FP-REDUCE-ZVFH-LABEL: 'reduce_fmul_half'
@@ -116,6 +150,40 @@ define void @reduce_fmul_double() {
   ret void
 }
 
+define void @reduce_ordered_fmul_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_ordered_fmul_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8 = call bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16 = call bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v32 = call bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 255 for instruction: %V64 = call bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 510 for instruction: %V128 = call bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_ordered_fmul_bfloat'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8 = call bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16 = call bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v32 = call bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64 = call bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %V1 = call bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+  %V2 = call bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+  %V4 = call bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+  %V8 = call bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+  %V16 = call bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+  %v32 = call bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+  %V64 = call bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+  %V128 = call bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+  ret void
+}
+
 define void @reduce_ordered_fmul_half() {
 ; FP-REDUCE-LABEL: 'reduce_ordered_fmul_half'
 ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fmul.v1f16(half 0xH0000, <1 x half> undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
index b953cf1f5bed8..1639f21f243d8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
@@ -1,112 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefixes=ZVFBFMIN,ZVFBMIN32
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefixes=ZVFBFMIN,ZVFBMIN64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
 
 define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x bfloat> %c) {
 ; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v1f32:
 ; ZVFBFWMA: # %bb.0:
-; ZVFBFWMA-NEXT: addi sp, sp, -16
-; ZVFBFWMA-NEXT: .cfi_def_cfa_offset 16
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa0
-; ZVFBFWMA-NEXT: fsw fa5, 8(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 8
-; ZVFBFWMA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBFWMA-NEXT: vle32.v v9, (a0)
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa1
-; ZVFBFWMA-NEXT: fsw fa5, 12(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 12
-; ZVFBFWMA-NEXT: vle32.v v10, (a0)
-; ZVFBFWMA-NEXT: vfmacc.vv v8, v9, v10
-; ZVFBFWMA-NEXT: addi sp, sp, 16
+; ZVFBFWMA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10
 ; ZVFBFWMA-NEXT: ret
 ;
-; ZVFBMIN32-LABEL: vfwmaccbf16_vv_v1f32:
-; ZVFBMIN32: # %bb.0:
-; ZVFBMIN32-NEXT: addi sp, sp, -32
-; ZVFBMIN32-NEXT: .cfi_def_cfa_offset 32
-; ZVFBMIN32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; ZVFBMIN32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; ZVFBMIN32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
-; ZVFBMIN32-NEXT: .cfi_offset ra, -4
-; ZVFBMIN32-NEXT: .cfi_offset s0, -8
-; ZVFBMIN32-NEXT: .cfi_offset fs0, -16
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: sub sp, sp, a0
-; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
-; ZVFBMIN32-NEXT: fmv.s fs0, fa0
-; ZVFBMIN32-NEXT: addi a0, sp, 16
-; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN32-NEXT: fmv.s fa0, fa1
-; ZVFBMIN32-NEXT: call __truncsfbf2
-; ZVFBMIN32-NEXT: fmv.x.w s0, fa0
-; ZVFBMIN32-NEXT: fmv.s fa0, fs0
-; ZVFBMIN32-NEXT: call __truncsfbf2
-; ZVFBMIN32-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN32-NEXT: slli a0, a0, 16
-; ZVFBMIN32-NEXT: sw a0, 8(sp)
-; ZVFBMIN32-NEXT: addi a0, sp, 8
-; ZVFBMIN32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN32-NEXT: vle32.v v10, (a0)
-; ZVFBMIN32-NEXT: slli s0, s0, 16
-; ZVFBMIN32-NEXT: sw s0, 12(sp)
-; ZVFBMIN32-NEXT: addi a0, sp, 12
-; ZVFBMIN32-NEXT: vle32.v v9, (a0)
-; ZVFBMIN32-NEXT: addi a0, sp, 16
-; ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: add sp, sp, a0
-; ZVFBMIN32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; ZVFBMIN32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; ZVFBMIN32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
-; ZVFBMIN32-NEXT: addi sp, sp, 32
-; ZVFBMIN32-NEXT: ret
-;
-; ZVFBMIN64-LABEL: vfwmaccbf16_vv_v1f32:
-; ZVFBMIN64: # %bb.0:
-; ZVFBMIN64-NEXT: addi sp, sp, -64
-; ZVFBMIN64-NEXT: .cfi_def_cfa_offset 64
-; ZVFBMIN64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: .cfi_offset ra, -8
-; ZVFBMIN64-NEXT: .cfi_offset s0, -16
-; ZVFBMIN64-NEXT: .cfi_offset fs0, -24
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: sub sp, sp, a0
-; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb
-; ZVFBMIN64-NEXT: fmv.s fs0, fa0
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN64-NEXT: fmv.s fa0, fa1
-; ZVFBMIN64-NEXT: call __truncsfbf2
-; ZVFBMIN64-NEXT: fmv.x.w s0, fa0
-; ZVFBMIN64-NEXT: fmv.s fa0, fs0
-; ZVFBMIN64-NEXT: call __truncsfbf2
-; ZVFBMIN64-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN64-NEXT: slli a0, a0, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, a0
-; ZVFBMIN64-NEXT: fsw fa5, 16(sp)
-; ZVFBMIN64-NEXT: addi a0, sp, 16
-; ZVFBMIN64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN64-NEXT: vle32.v v10, (a0)
-; ZVFBMIN64-NEXT: slli s0, s0, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, s0
-; ZVFBMIN64-NEXT: fsw fa5, 20(sp)
-; ZVFBMIN64-NEXT: addi a0, sp, 20
-; ZVFBMIN64-NEXT: vle32.v v9, (a0)
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: add sp, sp, a0
-; ZVFBMIN64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: addi sp, sp, 64
-; ZVFBMIN64-NEXT: ret
+; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v1f32:
+; ZVFBFMIN: # %bb.0:
+; ZVFBFMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v9
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v9, v10
+; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v9
+; ZVFBFMIN-NEXT: ret
   %b.ext = fpext <1 x bfloat> %b to <1 x float>
   %c.ext = fpext <1 x bfloat> %c to <1 x float>
   %res = call <1 x float> @llvm.fma.v1f32(<1 x float> %b.ext, <1 x float> %c.ext, <1 x float> %a)
@@ -116,96 +28,22 @@ define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x b
 define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> %c) {
 ; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v1f32:
 ; ZVFBFWMA: # %bb.0:
-; ZVFBFWMA-NEXT: addi sp, sp, -16
-; ZVFBFWMA-NEXT: .cfi_def_cfa_offset 16
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa0
-; ZVFBFWMA-NEXT: fsw fa5, 8(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 8
-; ZVFBFWMA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBFWMA-NEXT: vle32.v v9, (a0)
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa1
-; ZVFBFWMA-NEXT: fsw fa5, 12(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 12
-; ZVFBFWMA-NEXT: vle32.v v10, (a0)
-; ZVFBFWMA-NEXT: vfmacc.vv v8, v9, v10
-; ZVFBFWMA-NEXT: addi sp, sp, 16
+; ZVFBFWMA-NEXT: fmv.x.h a0, fa0
+; ZVFBFWMA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFWMA-NEXT: vmv.s.x v10, a0
+; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v9
 ; ZVFBFWMA-NEXT: ret
 ;
-; ZVFBMIN32-LABEL: vfwmaccbf16_vf_v1f32:
-; ZVFBMIN32: # %bb.0:
-; ZVFBMIN32-NEXT: addi sp, sp, -48
-; ZVFBMIN32-NEXT: .cfi_def_cfa_offset 48
-; ZVFBMIN32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; ZVFBMIN32-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
-; ZVFBMIN32-NEXT: .cfi_offset ra, -4
-; ZVFBMIN32-NEXT: .cfi_offset fs0, -16
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: sub sp, sp, a0
-; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; ZVFBMIN32-NEXT: fmv.s fs0, fa0
-; ZVFBMIN32-NEXT: addi a0, sp, 32
-; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN32-NEXT: fmv.s fa0, fa1
-; ZVFBMIN32-NEXT: call __truncsfbf2
-; ZVFBMIN32-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN32-NEXT: fmv.x.w a1, fs0
-; ZVFBMIN32-NEXT: slli a1, a1, 16
-; ZVFBMIN32-NEXT: sw a1, 8(sp)
-; ZVFBMIN32-NEXT: addi a1, sp, 8
-; ZVFBMIN32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN32-NEXT: vle32.v v10, (a1)
-; ZVFBMIN32-NEXT: slli a0, a0, 16
-; ZVFBMIN32-NEXT: sw a0, 12(sp)
-; ZVFBMIN32-NEXT: addi a0, sp, 12
-; ZVFBMIN32-NEXT: vle32.v v9, (a0)
-; ZVFBMIN32-NEXT: addi a0, sp, 32
-; ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: add sp, sp, a0
-; ZVFBMIN32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; ZVFBMIN32-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
-; ZVFBMIN32-NEXT: addi sp, sp, 48
-; ZVFBMIN32-NEXT: ret
-;
-; ZVFBMIN64-LABEL: vfwmaccbf16_vf_v1f32:
-; ZVFBMIN64: # %bb.0:
-; ZVFBMIN64-NEXT: addi sp, sp, -48
-; ZVFBMIN64-NEXT: .cfi_def_cfa_offset 48
-; ZVFBMIN64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: .cfi_offset ra, -8
-; ZVFBMIN64-NEXT: .cfi_offset fs0, -16
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: sub sp, sp, a0
-; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; ZVFBMIN64-NEXT: fmv.s fs0, fa0
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN64-NEXT: fmv.s fa0, fa1
-; ZVFBMIN64-NEXT: call __truncsfbf2
-; ZVFBMIN64-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN64-NEXT: fmv.x.w a1, fs0
-; ZVFBMIN64-NEXT: slli a1, a1, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, a1
-; ZVFBMIN64-NEXT: fsw fa5, 24(sp)
-; ZVFBMIN64-NEXT: addi a1, sp, 24
-; ZVFBMIN64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN64-NEXT: vle32.v v10, (a1)
-; ZVFBMIN64-NEXT: slli a0, a0, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, a0
-; ZVFBMIN64-NEXT: fsw fa5, 28(sp)
-; ZVFBMIN64-NEXT: addi a0, sp, 28
-; ZVFBMIN64-NEXT: vle32.v v9, (a0)
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: add sp, sp, a0
-; ZVFBMIN64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: addi sp, sp, 48
-; ZVFBMIN64-NEXT: ret
+; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v1f32:
+; ZVFBFMIN: # %bb.0:
+; ZVFBFMIN-NEXT: fmv.x.w a0, fa0
+; ZVFBFMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFMIN-NEXT: vmv.s.x v10, a0
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v10
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v10
+; ZVFBFMIN-NEXT: ret
   %b.head = insertelement <1 x bfloat> poison, bfloat %b, i32 0
   %b.splat = shufflevector <1 x bfloat> %b.head, <1 x bfloat> poison, <1 x i32> zeroinitializer
   %b.ext = fpext <1 x bfloat> %b.splat to <1 x float>
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 151b91184bf42..4cc9a0124337d 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -1,13 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
-; RUN: -riscv-v-slp-max-vf=0 -S \
-; RUN: | FileCheck %s --check-prefixes=CHECK,ZVL128
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v,+zvl256b \
-; RUN: -riscv-v-slp-max-vf=0 -S \
-; RUN: | FileCheck %s --check-prefixes=CHECK,ZVL256
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v,+zvl512b \
-; RUN: -riscv-v-slp-max-vf=0 -S \
-; RUN: | FileCheck %s --check-prefixes=CHECK,ZVL512
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvfhmin,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL128
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvl256b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL256
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvl512b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL512
 
 target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
 target triple = "riscv64"
@@ -332,6 +335,21 @@ entry:
 }
 
 define void @reduce_or_2() {
+; ZVFHMIN-LABEL: @reduce_or_2(
+; ZVFHMIN-NEXT: [[TMP1:%.*]] = shl i64 0, 0
+; ZVFHMIN-NEXT: [[TMP2:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 15
+; ZVFHMIN-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
+; ZVFHMIN-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 6
+; ZVFHMIN-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
+; ZVFHMIN-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; ZVFHMIN-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
+; ZVFHMIN: 8:
+; ZVFHMIN-NEXT: ret void
+; ZVFHMIN: 9:
+; ZVFHMIN-NEXT: ret void
+;
 ; ZVL128-LABEL: @reduce_or_2(
 ; ZVL128-NEXT: [[TMP1:%.*]] = shl i64 0, 0
 ; ZVL128-NEXT: [[TMP2:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 15
@@ -1152,3 +1170,128 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) {
   %add10.3 = add nuw nsw i32 %add10.2, %conv9.3
   ret i32 %add10.3
 }
+
+; Shouldn't vectorize to a reduction because we can't promote it
+define bfloat @fadd_4xbf16(ptr %p) {
+; CHECK-LABEL: @fadd_4xbf16(
+; CHECK-NEXT: [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
+; CHECK-NEXT: [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
+; CHECK-NEXT: [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
+; CHECK-NEXT: [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
+; CHECK-NEXT: [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
+; CHECK-NEXT: [[R0:%.*]] = fadd fast bfloat [[X0]], [[X1]]
+; CHECK-NEXT: [[R1:%.*]] = fadd fast bfloat [[R0]], [[X2]]
+; CHECK-NEXT: [[R2:%.*]] = fadd fast bfloat [[R1]], [[X3]]
+; CHECK-NEXT: ret bfloat [[R2]]
+;
+  %x0 = load bfloat, ptr %p
+  %p1 = getelementptr bfloat, ptr %p, i32 1
+  %x1 = load bfloat, ptr %p1
+  %p2 = getelementptr bfloat, ptr %p, i32 2
+  %x2 = load bfloat, ptr %p2
+  %p3 = getelementptr bfloat, ptr %p, i32 3
+  %x3 = load bfloat, ptr %p3
+
+  %r0 = fadd fast bfloat %x0, %x1
+  %r1 = fadd fast bfloat %r0, %x2
+  %r2 = fadd fast bfloat %r1, %x3
+
+  ret bfloat %r2
+}
+
+; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
+define bfloat @fmul_4xbf16(ptr %p) {
+; CHECK-LABEL: @fmul_4xbf16(
+; CHECK-NEXT: [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
+; CHECK-NEXT: [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
+; CHECK-NEXT: [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
+; CHECK-NEXT: [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
+; CHECK-NEXT: [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
+; CHECK-NEXT: [[R0:%.*]] = fmul fast bfloat [[X0]], [[X1]]
+; CHECK-NEXT: [[R1:%.*]] = fmul fast bfloat [[R0]], [[X2]]
+; CHECK-NEXT: [[R2:%.*]] = fmul fast bfloat [[R1]], [[X3]]
+; CHECK-NEXT: ret bfloat [[R2]]
+;
+  %x0 = load bfloat, ptr %p
+  %p1 = getelementptr bfloat, ptr %p, i32 1
+  %x1 = load bfloat, ptr %p1
+  %p2 = getelementptr bfloat, ptr %p, i32 2
+  %x2 = load bfloat, ptr %p2
+  %p3 = getelementptr bfloat, ptr %p, i32 3
+  %x3 = load bfloat, ptr %p3
+
+  %r0 = fmul fast bfloat %x0, %x1
+  %r1 = fmul fast bfloat %r0, %x2
+  %r2 = fmul fast bfloat %r1, %x3
+
+  ret bfloat %r2
+}
+
+; Shouldn't vectorize to a reduction on zvfhmin because we can't promote it
+define half @fadd_4xf16(ptr %p) {
+; ZVFHMIN-LABEL: @fadd_4xf16(
+; ZVFHMIN-NEXT: [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
+; ZVFHMIN-NEXT: [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
+; ZVFHMIN-NEXT: [[X1:%.*]] = load half, ptr [[P1]], align 2
+; ZVFHMIN-NEXT: [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
+; ZVFHMIN-NEXT: [[X2:%.*]] = load half, ptr [[P2]], align 2
+; ZVFHMIN-NEXT: [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
+; ZVFHMIN-NEXT: [[X3:%.*]] = load half, ptr [[P3]], align 2
+; ZVFHMIN-NEXT: [[R0:%.*]] = fadd fast half [[X0]], [[X1]]
+; ZVFHMIN-NEXT: [[R1:%.*]] = fadd fast half [[R0]], [[X2]]
+; ZVFHMIN-NEXT: [[R2:%.*]] = fadd fast half [[R1]], [[X3]]
+; ZVFHMIN-NEXT: ret half [[R2]]
+;
+; ZVFH-LABEL: @fadd_4xf16(
+; ZVFH-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[P:%.*]], align 2
+; ZVFH-NEXT: [[TMP2:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP1]])
+; ZVFH-NEXT: ret half [[TMP2]]
+;
+  %x0 = load half, ptr %p
+  %p1 = getelementptr half, ptr %p, i32 1
+  %x1 = load half, ptr %p1
+  %p2 = getelementptr half, ptr %p, i32 2
+  %x2 = load half, ptr %p2
+  %p3 = getelementptr half, ptr %p, i32 3
+  %x3 = load half, ptr %p3
+
+  %r0 = fadd fast half %x0, %x1
+  %r1 = fadd fast half %r0, %x2
+  %r2 = fadd fast half %r1, %x3
+
+  ret half %r2
+}
+
+; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
+define half @fmul_4xf16(ptr %p) {
+; CHECK-LABEL: @fmul_4xf16(
+; CHECK-NEXT: [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
+; CHECK-NEXT: [[X1:%.*]] = load half, ptr [[P1]], align 2
+; CHECK-NEXT: [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
+; CHECK-NEXT: [[X2:%.*]] = load half, ptr [[P2]], align 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
+; CHECK-NEXT: [[X3:%.*]] = load half, ptr [[P3]], align 2
+; CHECK-NEXT: [[R0:%.*]] = fmul fast half [[X0]], [[X1]]
+; CHECK-NEXT: [[R1:%.*]] = fmul fast half [[R0]], [[X2]]
+; CHECK-NEXT: [[R2:%.*]] = fmul fast half [[R1]], [[X3]]
+; CHECK-NEXT: ret half [[R2]]
+;
+  %x0 = load half, ptr %p
+  %p1 = getelementptr half, ptr %p, i32 1
+  %x1 = load half, ptr %p1
+  %p2 = getelementptr half, ptr %p, i32 2
+  %x2 = load half, ptr %p2
+  %p3 = getelementptr half, ptr %p, i32 3
+  %x3 = load half, ptr %p3
+
+  %r0 = fmul fast half %x0, %x1
+  %r1 = fmul fast half %r0, %x2
+  %r2 = fmul fast half %r1, %x3
+
+  ret half %r2
+}