; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"

; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask
; is constant and references at most two distinct source vectors, since a
; shufflevector has exactly two inputs. Out-of-range tbl indices read as zero
; (tbx keeps the fallback element instead), so OOB lanes count as an extra
; source.
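;
; For example, assuming the transform fires on @tbl1_basic below, the call with
; the constant lane-reversing mask is equivalent to this single shuffle of one
; source (a sketch of the equivalence, not a FileCheck assertion):
;
;   %shuf = shufflevector <16 x i8> %a, <16 x i8> poison,
;           <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>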

; Basic tbl1 with all in-bounds indices should optimize to shufflevector.
define <16 x i8> @tbl1_basic(<16 x i8> %a) {
; CHECK-LABEL: @tbl1_basic(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %tbl
}

; tbl2 with both operands the same should optimize (1 unique source).
define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) {
; CHECK-LABEL: @tbl2_duplicate_operands(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
  ret <16 x i8> %tbl
}

; tbl4 with alternating duplicate operands should optimize (2 unique sources).
define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @tbl4_duplicate_operands(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
  ret <16 x i8> %tbl
}
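
; tbl4 concatenates its four table registers into one 64-byte table, so the
; mask above reads a[0..3], b[0..3], a[0..3], b[0..3]. With only two unique
; sources this is expressible as one shuffle; a sketch, assuming the transform
; remaps indices into the concatenation of %a (lanes 0-15) and %b (lanes 16-31):
;
;   %shuf = shufflevector <16 x i8> %a, <16 x i8> %b,
;           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19>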

; tbl4 where mask only references first two operands should optimize.
define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: @tbl4_unused_operands(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
  ret <16 x i8> %tbl
}

; tbl4 where mask only references one operand should optimize.
define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: @tbl4_single_operand_used(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %tbl
}

; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
define <16 x i8> @tbl1_with_oob(<16 x i8> %a) {
; CHECK-LABEL: @tbl1_with_oob(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
  ret <16 x i8> %tbl
}
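
; Out-of-range tbl indices read as zero, so lanes 8-15 above would have to
; come from a zero vector, which then occupies the second shuffle operand.
; A sketch under that assumption (the zero lanes may use any index >= 16):
;
;   %shuf = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer,
;           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>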

; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) {
; CHECK-LABEL: @tbl2_duplicate_with_oob(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
  ret <16 x i8> %tbl
}

; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources).
define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @tbl2_with_oob_bail(
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
; CHECK-NEXT: ret <16 x i8> [[TBL]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
  ret <16 x i8> %tbl
}

; tbl1 with all OOB indices should optimize to zero vector.
define <16 x i8> @tbl1_all_oob(<16 x i8> %a) {
; CHECK-LABEL: @tbl1_all_oob(
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
; CHECK-NEXT: ret <16 x i8> [[TBL]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
  ret <16 x i8> %tbl
}
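
; With every index out of range, each lane reads zero, so the call above is
; equivalent to returning a zero vector (a sketch of the expected fold):
;
;   ret <16 x i8> zeroinitializer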

; tbl3 referencing all 3 operands should NOT optimize.
define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: @tbl3_three_sources_bail(
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
; CHECK-NEXT: ret <16 x i8> [[TBL]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
  ret <16 x i8> %tbl
}

; tbl4 referencing 3 unique operands should NOT optimize.
define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
; CHECK-LABEL: @tbl4_three_sources_bail(
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
; CHECK-NEXT: ret <16 x i8> [[TBL]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
  ret <16 x i8> %tbl
}

; tbl4 referencing all 4 unique operands should NOT optimize.
define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
; CHECK-LABEL: @tbl4_four_sources_bail(
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
; CHECK-NEXT: ret <16 x i8> [[TBL]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
  ret <16 x i8> %tbl
}

; tbx1 with no OOB should optimize.
define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) {
; CHECK-LABEL: @tbx1_no_oob(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %tbx
}

; tbx2 where fallback == second source operand should optimize (deduplicated).
define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @tbx2_fallback_equals_second_source(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[B:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
  ret <16 x i8> %tbx
}

; tbx1 with OOB where fallback == source should optimize (deduplicated).
define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) {
; CHECK-LABEL: @tbx1_oob_fallback_same_as_source(
; CHECK-NEXT: [[A:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A1:%.*]], <16 x i8> [[A1]], <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
; CHECK-NEXT: ret <16 x i8> [[A]]
;
  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
  ret <16 x i8> %tbx
}
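
; tbx differs from tbl in that an out-of-range index keeps the fallback
; element for that lane instead of producing zero. Here the fallback and the
; table are both %a, so both shuffle operands collapse to one vector; a sketch
; of the equivalence, assuming the transform performs this deduplication:
;
;   %shuf = shufflevector <16 x i8> %a, <16 x i8> poison,
;           <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>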

; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources).
define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: @tbx2_with_oob_bail(
; CHECK-NEXT: [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
; CHECK-NEXT: ret <16 x i8> [[TBX]]
;
  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
  ret <16 x i8> %tbx
}

; tbx1 with all OOB indices should optimize to fallback.
define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) {
; CHECK-LABEL: @tbx1_all_oob(
; CHECK-NEXT: [[FALLBACK:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[FALLBACK1:%.*]], <16 x i8> [[A:%.*]], <16 x i8> splat (i8 99))
; CHECK-NEXT: ret <16 x i8> [[FALLBACK]]
;
  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
  ret <16 x i8> %tbx
}
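
; Every lane is out of range, so every lane keeps its fallback element and the
; call above is equivalent to (a sketch of the expected fold):
;
;   ret <16 x i8> %fallback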

; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize.
define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) {
; CHECK-LABEL: @tbx1_fallback_size_mismatch(
; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
; CHECK-NEXT: ret <8 x i8> [[TBX]]
;
  %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
  ret <8 x i8> %tbx
}

; tbx1 with no OOB and mismatched fallback/source sizes should optimize.
define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) {
; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob(
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; CHECK-NEXT: ret <8 x i8> [[TMP1]]
;
  %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <8 x i8> %tbx
}

; tbl1 with non-i8 element type should NOT optimize.
define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
; CHECK-LABEL: @tbl1_8x16(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; CHECK-NEXT: ret <8 x i16> [[TBL1]]
;
entry:
  %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
  ret <8 x i16> %tbl1
}
declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>)

; tbl1 with an element count other than 8 or 16 should NOT optimize.
define <12 x i8> @tbl1_16x8(<16 x i8> %vec) {
; CHECK-LABEL: @tbl1_16x8(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; CHECK-NEXT: ret <12 x i8> [[TBL1]]
;
entry:
  %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <12 x i8> %tbl1
}
declare <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8>, <12 x i8>)

; Non-constant mask should NOT optimize.
define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) {
; CHECK-LABEL: @tbl1_non_constant_mask(
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]])
; CHECK-NEXT: ret <16 x i8> [[TBL]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask)
  ret <16 x i8> %tbl
}

; Mask with some poison elements should optimize, with poison propagating to the output.
define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) {
; CHECK-LABEL: @tbl1_poison_mask_elements(
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
; CHECK-NEXT: ret <16 x i8> [[TMP1]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <16 x i8> %tbl
}

; Mask with all poison elements should optimize to poison.
define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) {
; CHECK-LABEL: @tbl1_all_poison_mask(
; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> poison)
; CHECK-NEXT: ret <16 x i8> [[TBL]]
;
  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison)
  ret <16 x i8> %tbl
}

; "Real" declarations
declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone