@@ -6905,7 +6905,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11)
 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
@@ -6927,7 +6927,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11)
 ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
@@ -6944,7 +6944,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6968,7 +6968,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0
 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
@@ -7035,7 +7035,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11)
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero
@@ -7057,7 +7057,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
 ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
@@ -7070,7 +7070,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9
@@ -7083,7 +7083,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11)
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5]
 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
@@ -7589,7 +7589,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
@@ -7611,7 +7611,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
@@ -7628,7 +7628,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -7652,7 +7652,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0
 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
@@ -7719,7 +7719,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11)
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero
@@ -7741,7 +7741,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
 ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
@@ -7754,7 +7754,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9
@@ -7767,7 +7767,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11)
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5]
 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
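
Note (editorial, not part of the generated checks): the rewritten vpternlogq comments are only equivalent to the old blend form, dst = src ^ (mask & (dst ^ src)), when the destination operand is already zero wherever the zmm11 mask is zero. That appears to hold here because the blended destinations are vpmovzxwq zero extensions, whose set bits fall only in the low word of each qword, exactly where the repeating [65535,0,65535,65535] mask is all ones. A minimal Python sketch of the bit identity under that known-zeros assumption, with illustrative names (a = source, b = zero-extended destination, m = mask):

    # Exhaustively check over 3-bit values that the old form,
    # a ^ (m & (b ^ a)) (a bitwise select: b where m is set, a elsewhere),
    # matches the new form, b | (a & ~m), whenever b has no bits set
    # outside the mask m (the precondition supplied by vpmovzxwq above).
    for a in range(8):
        for m in range(8):
            for b in range(8):
                if b & ~m:
                    continue  # b violates the known-zeros precondition
                assert (a ^ (m & (b ^ a))) == (b | (a & ~m))
    print("old and new vpternlogq forms agree under the known-zeros assumption")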