Skip to content

Commit 0c2f859

Browse files
brads55tstellar
authored and committed
Workaround incorrect types when lowering fixed length gather/scatter
When lowering a fixed length gather/scatter the index type is assumed to be the same as the memory type, this is incorrect in cases where the extension of the index has been folded into the addressing mode. For now add a temporary workaround to fix the codegen faults caused by this by preventing the removal of this extension. At a later date the lowering for SVE gather/scatters will be redesigned to improve the way addressing modes are handled. As a short term side effect of this change, the addressing modes generated for fixed length gather/scatters will not be optimal. Differential Revision: https://reviews.llvm.org/D109145 (cherry picked from commit 14e1a4a)
1 parent f17d60d commit 0c2f859

File tree

3 files changed

+112
-52
lines changed

3 files changed

+112
-52
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4161,7 +4161,8 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
41614161

41624162
bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
41634163
if (VT.getVectorElementType() == MVT::i32 &&
4164-
VT.getVectorElementCount().getKnownMinValue() >= 4)
4164+
VT.getVectorElementCount().getKnownMinValue() >= 4 &&
4165+
!VT.isFixedLengthVector())
41654166
return true;
41664167

41674168
return false;

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,7 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
917917
; The above tests test the types, the below tests check that the addressing
918918
; modes still function
919919

920+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
920921
define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
921922
; CHECK-LABEL: masked_gather_32b_scaled_sext_f16:
922923
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
@@ -925,11 +926,15 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
925926
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
926927
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
927928
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
928-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
929-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
930-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #1]
931-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
932-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
929+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
930+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
931+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
932+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
933+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0
934+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d, lsl #1]
935+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
936+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
937+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
933938
; VBITS_GE_2048-NEXT: ret
934939
%cvals = load <32 x half>, <32 x half>* %a
935940
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -941,14 +946,21 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
941946
ret void
942947
}
943948

949+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
944950
define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
945951
; CHECK-LABEL: masked_gather_32b_scaled_sext_f32:
946-
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl32
947-
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG]]/z, [x0]
948-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG]]/z, [x1]
949-
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG]]/z, [[VALS]].s, #0.0
950-
; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #2]
951-
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
952+
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
953+
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
954+
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG0]]/z, [x1]
955+
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
956+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
957+
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1
958+
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
959+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
960+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
961+
; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d, lsl #2]
962+
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s
963+
; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0]
952964
; VBITS_GE_2048-NEXT: ret
953965
%cvals = load <32 x float>, <32 x float>* %a
954966
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -960,14 +972,16 @@ define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b,
960972
ret void
961973
}
962974

975+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
963976
define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
964977
; CHECK-LABEL: masked_gather_32b_scaled_sext_f64:
965978
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32
966-
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
967979
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
980+
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
968981
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
969982
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0
970-
; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, sxtw #3]
983+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
984+
; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d, lsl #3]
971985
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG0]], [x0]
972986
; VBITS_GE_2048-NEXT: ret
973987
%cvals = load <32 x double>, <32 x double>* %a
@@ -980,6 +994,7 @@ define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b
980994
ret void
981995
}
982996

997+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
983998
define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
984999
; CHECK-LABEL: masked_gather_32b_scaled_zext:
9851000
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
@@ -988,11 +1003,15 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
9881003
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
9891004
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
9901005
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
991-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
992-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
993-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw #1]
994-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
995-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1006+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1007+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1008+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
1009+
; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s
1010+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0
1011+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[ZEXT]].d, lsl #1]
1012+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1013+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1014+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
9961015
; VBITS_GE_2048-NEXT: ret
9971016
%cvals = load <32 x half>, <32 x half>* %a
9981017
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -1004,6 +1023,7 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half
10041023
ret void
10051024
}
10061025

1026+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
10071027
define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
10081028
; CHECK-LABEL: masked_gather_32b_unscaled_sext:
10091029
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
@@ -1012,11 +1032,15 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
10121032
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
10131033
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
10141034
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
1015-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
1016-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
1017-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw]
1018-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
1019-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1035+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1036+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1037+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
1038+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
1039+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0
1040+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d]
1041+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1042+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1043+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
10201044
; VBITS_GE_2048-NEXT: ret
10211045
%cvals = load <32 x half>, <32 x half>* %a
10221046
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -1029,6 +1053,7 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8
10291053
ret void
10301054
}
10311055

1056+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
10321057
define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
10331058
; CHECK-LABEL: masked_gather_32b_unscaled_zext:
10341059
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
@@ -1037,11 +1062,15 @@ define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8
10371062
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
10381063
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
10391064
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
1040-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
1041-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
1042-
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw]
1043-
; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h
1044-
; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0]
1065+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
1066+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
1067+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
1068+
; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s
1069+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0
1070+
; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[ZEXT]].d]
1071+
; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s
1072+
; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h
1073+
; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0]
10451074
; VBITS_GE_2048-NEXT: ret
10461075
%cvals = load <32 x half>, <32 x half>* %a
10471076
%idxs = load <32 x i32>, <32 x i32>* %b

llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll

Lines changed: 53 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -839,18 +839,24 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
839839

840840
; The above tests test the types, the below tests check that the addressing
841841
; modes still function
842+
843+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
842844
define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
843845
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16:
844846
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
845847
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
846848
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
847849
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
850+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
848851
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
849852
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
850-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
851-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
852-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
853-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1]
853+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
854+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
855+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
856+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
857+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
858+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
859+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[SEXT]].d, lsl #1]
854860
; VBITS_GE_2048-NEXT: ret
855861
%vals = load <32 x half>, <32 x half>* %a
856862
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -861,13 +867,20 @@ define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b,
861867
ret void
862868
}
863869

870+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
864871
define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 {
865872
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32:
866-
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl32
867-
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG]]/z, [x0]
868-
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG]]/z, [x1]
869-
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG]]/z, [[VALS]].s, #0.0
870-
; VBITS_GE_2048-NEXT: st1w { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #2]
873+
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
874+
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
875+
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG0]]/z, [x1]
876+
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
877+
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
878+
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1
879+
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
880+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
881+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
882+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
883+
; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], [x2, [[SEXT]].d, lsl #2]
871884
; VBITS_GE_2048-NEXT: ret
872885
%vals = load <32 x float>, <32 x float>* %a
873886
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -878,14 +891,16 @@ define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b
878891
ret void
879892
}
880893

894+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
881895
define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 {
882896
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64:
883897
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32
884-
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
885898
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
899+
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
886900
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
887901
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0
888-
; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, sxtw #3]
902+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
903+
; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], [x2, [[SEXT]].d, lsl #3]
889904
; VBITS_GE_2048-NEXT: ret
890905
%vals = load <32 x double>, <32 x double>* %a
891906
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -896,18 +911,23 @@ define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %
896911
ret void
897912
}
898913

914+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
899915
define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
900916
; CHECK-LABEL: masked_scatter_32b_scaled_zext:
901917
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
902918
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
903919
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
904920
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
921+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
905922
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
906923
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
907-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
908-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
909-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
910-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1]
924+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
925+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
926+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
927+
; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s
928+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0
929+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
930+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[ZEXT]].d, lsl #1]
911931
; VBITS_GE_2048-NEXT: ret
912932
%vals = load <32 x half>, <32 x half>* %a
913933
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -918,18 +938,23 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal
918938
ret void
919939
}
920940

941+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
921942
define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
922943
; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
923944
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
924945
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
925946
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
926947
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
948+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
927949
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
928950
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
929-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
930-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
931-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
932-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw]
951+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
952+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
953+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
954+
; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s
955+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0
956+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
957+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[SEXT]].d]
933958
; VBITS_GE_2048-NEXT: ret
934959
%vals = load <32 x half>, <32 x half>* %a
935960
%idxs = load <32 x i32>, <32 x i32>* %b
@@ -941,18 +966,23 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i
941966
ret void
942967
}
943968

969+
; NOTE: This produces a non-optimal addressing mode due to a temporary workaround
944970
define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
945971
; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
946972
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
947973
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
948974
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
949975
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
976+
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32
950977
; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
951978
; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
952-
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
953-
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
954-
; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
955-
; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw]
979+
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
980+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
981+
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
982+
; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s
983+
; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0
984+
; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
985+
; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[ZEXT]].d]
956986
; VBITS_GE_2048-NEXT: ret
957987
%vals = load <32 x half>, <32 x half>* %a
958988
%idxs = load <32 x i32>, <32 x i32>* %b

0 commit comments

Comments
 (0)