@@ -1083,6 +1083,269 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
10831083 ret i32 %ret
10841084}
10851085
1086+ ; Multiple uses of the store chain AND stored value
1087+ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind {
1088+ ; X86-LABEL: chain_reset_i256:
1089+ ; X86: # %bb.0:
1090+ ; X86-NEXT: pushl %ebp
1091+ ; X86-NEXT: movl %esp, %ebp
1092+ ; X86-NEXT: pushl %ebx
1093+ ; X86-NEXT: pushl %edi
1094+ ; X86-NEXT: pushl %esi
1095+ ; X86-NEXT: andl $-16, %esp
1096+ ; X86-NEXT: subl $112, %esp
1097+ ; X86-NEXT: movzbl 20(%ebp), %ecx
1098+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1099+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1100+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1101+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1102+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1103+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1104+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1105+ ; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
1106+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1107+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1108+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1109+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1110+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1111+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1112+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1113+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1114+ ; X86-NEXT: movl %ecx, %eax
1115+ ; X86-NEXT: shrb $3, %al
1116+ ; X86-NEXT: andb $28, %al
1117+ ; X86-NEXT: negb %al
1118+ ; X86-NEXT: movsbl %al, %eax
1119+ ; X86-NEXT: movl 72(%esp,%eax), %edx
1120+ ; X86-NEXT: movl 76(%esp,%eax), %edi
1121+ ; X86-NEXT: movl %edi, %esi
1122+ ; X86-NEXT: shldl %cl, %edx, %esi
1123+ ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1124+ ; X86-NEXT: movl 68(%esp,%eax), %esi
1125+ ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1126+ ; X86-NEXT: shldl %cl, %esi, %edx
1127+ ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1128+ ; X86-NEXT: movl 80(%esp,%eax), %edx
1129+ ; X86-NEXT: movl 84(%esp,%eax), %ebx
1130+ ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1131+ ; X86-NEXT: shldl %cl, %edx, %ebx
1132+ ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1133+ ; X86-NEXT: shldl %cl, %edi, %edx
1134+ ; X86-NEXT: movl 64(%esp,%eax), %edi
1135+ ; X86-NEXT: movl 88(%esp,%eax), %esi
1136+ ; X86-NEXT: movl 92(%esp,%eax), %eax
1137+ ; X86-NEXT: shldl %cl, %esi, %eax
1138+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
1139+ ; X86-NEXT: shldl %cl, %ebx, %esi
1140+ ; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
1141+ ; X86-NEXT: shll %cl, %edi
1142+ ; X86-NEXT: movl %edi, %ecx
1143+ ; X86-NEXT: movl %esi, %ebx
1144+ ; X86-NEXT: notl %ebx
1145+ ; X86-NEXT: notl %eax
1146+ ; X86-NEXT: notl %edx
1147+ ; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
1148+ ; X86-NEXT: notl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
1149+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
1150+ ; X86-NEXT: notl %edi
1151+ ; X86-NEXT: notl %ecx
1152+ ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1153+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
1154+ ; X86-NEXT: notl %esi
1155+ ; X86-NEXT: movl 8(%ebp), %ecx
1156+ ; X86-NEXT: andl 12(%ecx), %edi
1157+ ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1158+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
1159+ ; X86-NEXT: andl 8(%ecx), %edi
1160+ ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1161+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
1162+ ; X86-NEXT: andl 20(%ecx), %edi
1163+ ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1164+ ; X86-NEXT: andl 16(%ecx), %edx
1165+ ; X86-NEXT: andl 28(%ecx), %eax
1166+ ; X86-NEXT: andl 24(%ecx), %ebx
1167+ ; X86-NEXT: andl 4(%ecx), %esi
1168+ ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1169+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
1170+ ; X86-NEXT: andl (%ecx), %esi
1171+ ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1172+ ; X86-NEXT: movl %ebx, 24(%ecx)
1173+ ; X86-NEXT: movl %eax, 28(%ecx)
1174+ ; X86-NEXT: movl %edx, 16(%ecx)
1175+ ; X86-NEXT: movl %edi, 20(%ecx)
1176+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
1177+ ; X86-NEXT: movl %edi, 8(%ecx)
1178+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
1179+ ; X86-NEXT: movl %edi, 12(%ecx)
1180+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
1181+ ; X86-NEXT: movl %esi, (%ecx)
1182+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
1183+ ; X86-NEXT: movl %esi, 4(%ecx)
1184+ ; X86-NEXT: orl %edi, %eax
1185+ ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
1186+ ; X86-NEXT: orl %eax, %esi
1187+ ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
1188+ ; X86-NEXT: movl 12(%ebp), %eax
1189+ ; X86-NEXT: movl (%eax), %ecx
1190+ ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
1191+ ; X86-NEXT: movl %edi, (%eax)
1192+ ; X86-NEXT: orl %edi, %edx
1193+ ; X86-NEXT: orl %ebx, %edx
1194+ ; X86-NEXT: orl %esi, %edx
1195+ ; X86-NEXT: movl 16(%ebp), %eax
1196+ ; X86-NEXT: movl (%eax), %eax
1197+ ; X86-NEXT: jne .LBB23_2
1198+ ; X86-NEXT: # %bb.1:
1199+ ; X86-NEXT: addl %ecx, %eax
1200+ ; X86-NEXT: .LBB23_2:
1201+ ; X86-NEXT: leal -12(%ebp), %esp
1202+ ; X86-NEXT: popl %esi
1203+ ; X86-NEXT: popl %edi
1204+ ; X86-NEXT: popl %ebx
1205+ ; X86-NEXT: popl %ebp
1206+ ; X86-NEXT: retl
1207+ ;
1208+ ; SSE-LABEL: chain_reset_i256:
1209+ ; SSE: # %bb.0:
1210+ ; SSE-NEXT: xorps %xmm0, %xmm0
1211+ ; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
1212+ ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1213+ ; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
1214+ ; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
1215+ ; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
1216+ ; SSE-NEXT: movl %ecx, %eax
1217+ ; SSE-NEXT: shrb $3, %al
1218+ ; SSE-NEXT: andb $24, %al
1219+ ; SSE-NEXT: negb %al
1220+ ; SSE-NEXT: movsbq %al, %r10
1221+ ; SSE-NEXT: movq -24(%rsp,%r10), %r8
1222+ ; SSE-NEXT: movq -16(%rsp,%r10), %rax
1223+ ; SSE-NEXT: shldq %cl, %r8, %rax
1224+ ; SSE-NEXT: movq -32(%rsp,%r10), %r9
1225+ ; SSE-NEXT: shldq %cl, %r9, %r8
1226+ ; SSE-NEXT: movq -40(%rsp,%r10), %r10
1227+ ; SSE-NEXT: shldq %cl, %r10, %r9
1228+ ; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
1229+ ; SSE-NEXT: shlq %cl, %r10
1230+ ; SSE-NEXT: notq %r8
1231+ ; SSE-NEXT: notq %rax
1232+ ; SSE-NEXT: notq %r10
1233+ ; SSE-NEXT: notq %r9
1234+ ; SSE-NEXT: andq 24(%rdi), %rax
1235+ ; SSE-NEXT: andq 16(%rdi), %r8
1236+ ; SSE-NEXT: andq 8(%rdi), %r9
1237+ ; SSE-NEXT: andq (%rdi), %r10
1238+ ; SSE-NEXT: movq %r8, 16(%rdi)
1239+ ; SSE-NEXT: movq %rax, 24(%rdi)
1240+ ; SSE-NEXT: movq %r10, (%rdi)
1241+ ; SSE-NEXT: movq %r9, 8(%rdi)
1242+ ; SSE-NEXT: orq %rax, %r9
1243+ ; SSE-NEXT: orq %r10, %r8
1244+ ; SSE-NEXT: movl (%rsi), %eax
1245+ ; SSE-NEXT: movl %r10d, (%rsi)
1246+ ; SSE-NEXT: movl (%rdx), %ecx
1247+ ; SSE-NEXT: addl %ecx, %eax
1248+ ; SSE-NEXT: orq %r9, %r8
1249+ ; SSE-NEXT: cmovnel %ecx, %eax
1250+ ; SSE-NEXT: retq
1251+ ;
1252+ ; AVX2-LABEL: chain_reset_i256:
1253+ ; AVX2: # %bb.0:
1254+ ; AVX2-NEXT: # kill: def $ecx killed $ecx def $rcx
1255+ ; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
1256+ ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1257+ ; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
1258+ ; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1259+ ; AVX2-NEXT: movl %ecx, %eax
1260+ ; AVX2-NEXT: shrb $3, %al
1261+ ; AVX2-NEXT: andb $24, %al
1262+ ; AVX2-NEXT: negb %al
1263+ ; AVX2-NEXT: movsbq %al, %rax
1264+ ; AVX2-NEXT: movq -32(%rsp,%rax), %r8
1265+ ; AVX2-NEXT: movq -24(%rsp,%rax), %r9
1266+ ; AVX2-NEXT: movq %r9, %r10
1267+ ; AVX2-NEXT: shldq %cl, %r8, %r10
1268+ ; AVX2-NEXT: movq -40(%rsp,%rax), %r11
1269+ ; AVX2-NEXT: movq -16(%rsp,%rax), %rax
1270+ ; AVX2-NEXT: shldq %cl, %r9, %rax
1271+ ; AVX2-NEXT: shldq %cl, %r11, %r8
1272+ ; AVX2-NEXT: andnq 24(%rdi), %rax, %rax
1273+ ; AVX2-NEXT: andnq 16(%rdi), %r10, %r9
1274+ ; AVX2-NEXT: andnq 8(%rdi), %r8, %r8
1275+ ; AVX2-NEXT: shlxq %rcx, %r11, %rcx
1276+ ; AVX2-NEXT: andnq (%rdi), %rcx, %rcx
1277+ ; AVX2-NEXT: movq %r9, 16(%rdi)
1278+ ; AVX2-NEXT: movq %rax, 24(%rdi)
1279+ ; AVX2-NEXT: movq %rcx, (%rdi)
1280+ ; AVX2-NEXT: movq %r8, 8(%rdi)
1281+ ; AVX2-NEXT: orq %rax, %r8
1282+ ; AVX2-NEXT: orq %rcx, %r9
1283+ ; AVX2-NEXT: movl (%rsi), %eax
1284+ ; AVX2-NEXT: movl %ecx, (%rsi)
1285+ ; AVX2-NEXT: movl (%rdx), %ecx
1286+ ; AVX2-NEXT: addl %ecx, %eax
1287+ ; AVX2-NEXT: orq %r8, %r9
1288+ ; AVX2-NEXT: cmovnel %ecx, %eax
1289+ ; AVX2-NEXT: vzeroupper
1290+ ; AVX2-NEXT: retq
1291+ ;
1292+ ; AVX512-LABEL: chain_reset_i256:
1293+ ; AVX512: # %bb.0:
1294+ ; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
1295+ ; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1296+ ; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
1297+ ; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
1298+ ; AVX512-NEXT: # kill: def $ecx killed $ecx def $rcx
1299+ ; AVX512-NEXT: movl %ecx, %eax
1300+ ; AVX512-NEXT: shrb $3, %al
1301+ ; AVX512-NEXT: andb $24, %al
1302+ ; AVX512-NEXT: negb %al
1303+ ; AVX512-NEXT: movsbq %al, %rax
1304+ ; AVX512-NEXT: movq -40(%rsp,%rax), %r8
1305+ ; AVX512-NEXT: movq -32(%rsp,%rax), %r9
1306+ ; AVX512-NEXT: movq -24(%rsp,%rax), %r10
1307+ ; AVX512-NEXT: movq %r10, %r11
1308+ ; AVX512-NEXT: shldq %cl, %r9, %r11
1309+ ; AVX512-NEXT: movq -16(%rsp,%rax), %rax
1310+ ; AVX512-NEXT: shldq %cl, %r10, %rax
1311+ ; AVX512-NEXT: shlxq %rcx, %r8, %r10
1312+ ; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
1313+ ; AVX512-NEXT: shldq %cl, %r8, %r9
1314+ ; AVX512-NEXT: andnq 24(%rdi), %rax, %rax
1315+ ; AVX512-NEXT: andnq 16(%rdi), %r11, %rcx
1316+ ; AVX512-NEXT: andnq 8(%rdi), %r9, %r8
1317+ ; AVX512-NEXT: andnq (%rdi), %r10, %r9
1318+ ; AVX512-NEXT: movq %rcx, 16(%rdi)
1319+ ; AVX512-NEXT: movq %rax, 24(%rdi)
1320+ ; AVX512-NEXT: movq %r9, (%rdi)
1321+ ; AVX512-NEXT: movq %r8, 8(%rdi)
1322+ ; AVX512-NEXT: orq %rax, %r8
1323+ ; AVX512-NEXT: orq %r9, %rcx
1324+ ; AVX512-NEXT: movl (%rsi), %eax
1325+ ; AVX512-NEXT: movl %r9d, (%rsi)
1326+ ; AVX512-NEXT: movl (%rdx), %edx
1327+ ; AVX512-NEXT: addl %edx, %eax
1328+ ; AVX512-NEXT: orq %r8, %rcx
1329+ ; AVX512-NEXT: cmovnel %edx, %eax
1330+ ; AVX512-NEXT: vzeroupper
1331+ ; AVX512-NEXT: retq
1332+ %rem = and i32 %position, 255
1333+ %ofs = zext nneg i32 %rem to i256
1334+ %bit = shl nuw i256 1, %ofs
1335+ %ld0 = load i256, ptr %p0
1336+ %msk = xor i256 %bit, -1
1337+ %res = and i256 %ld0, %msk
1338+ store i256 %res, ptr %p0
1339+ %cmp = icmp ne i256 %res, 0
1340+ %ld1 = load i32, ptr %p1
1341+ %trunc = trunc i256 %res to i32
1342+ store i32 %trunc, ptr %p1
1343+ %ld2 = load i32, ptr %p2
1344+ %add = add i32 %ld1, %ld2
1345+ %sel = select i1 %cmp, i32 %ld2, i32 %add
1346+ ret i32 %sel
1347+ }
1348+
10861349; BTC/BT/BTS sequence on same i128
10871350define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind {
10881351; X86-LABEL: sequence_i128: