@@ -4989,3 +4989,257 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
49894989 %ext = shufflevector <2 x i32 > %cvt , <2 x i32 > zeroinitializer , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 >
49904990 ret <4 x i32 > %ext
49914991}
4992+
; fptosi_4f16_to_4i32 - signed conversion of a <4 x half> vector to <4 x i32>.
; Expected code for each check prefix, as listed in the assertions below -
;   AVX prefix     each half element is widened by its own call to
;                  __extendhfsf2 (four calls), pairs are packed with vinsertps
;                  and converted with vcvttps2dq, and the two converted pairs
;                  are joined with vunpcklpd.
;   F16C, AVX512   a single vcvtph2ps into ymm0 followed by vcvttps2dq, with
;                  vzeroupper before returning.
; NOTE(review): the RUN lines that map these prefixes to target features are
; outside this view - confirm there. The assertions look autogenerated, so
; regenerate them with the update script rather than editing by hand.
4993+ define <4 x i32 > @fptosi_4f16_to_4i32 (<4 x half > %a ) nounwind {
4994+ ; AVX-LABEL: fptosi_4f16_to_4i32:
4995+ ; AVX: # %bb.0:
4996+ ; AVX-NEXT: subq $72, %rsp
4997+ ; AVX-NEXT: vmovdqa %xmm0, %xmm1
4998+ ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4999+ ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
5000+ ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5001+ ; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
5002+ ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5003+ ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm0
5004+ ; AVX-NEXT: callq __extendhfsf2@PLT
5005+ ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5006+ ; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
5007+ ; AVX-NEXT: callq __extendhfsf2@PLT
5008+ ; AVX-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5009+ ; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
5010+ ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
5011+ ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5012+ ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5013+ ; AVX-NEXT: callq __extendhfsf2@PLT
5014+ ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5015+ ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5016+ ; AVX-NEXT: callq __extendhfsf2@PLT
5017+ ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5018+ ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5019+ ; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
5020+ ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
5021+ ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
5022+ ; AVX-NEXT: addq $72, %rsp
5023+ ; AVX-NEXT: retq
5024+ ;
5025+ ; F16C-LABEL: fptosi_4f16_to_4i32:
5026+ ; F16C: # %bb.0:
5027+ ; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
5028+ ; F16C-NEXT: vcvttps2dq %ymm0, %ymm0
5029+ ; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5030+ ; F16C-NEXT: vzeroupper
5031+ ; F16C-NEXT: retq
5032+ ;
5033+ ; AVX512-LABEL: fptosi_4f16_to_4i32:
5034+ ; AVX512: # %bb.0:
5035+ ; AVX512-NEXT: vcvtph2ps %xmm0, %ymm0
5036+ ; AVX512-NEXT: vcvttps2dq %ymm0, %ymm0
5037+ ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5038+ ; AVX512-NEXT: vzeroupper
5039+ ; AVX512-NEXT: retq
; IR under test - one full-width signed conversion, returned directly.
5040+ %cvt = fptosi <4 x half > %a to <4 x i32 >
5041+ ret <4 x i32 > %cvt
5042+ }
5043+
; fptoui_2f16_to_4i32 - unsigned conversion of <2 x half> to <2 x i32>, then
; widened to <4 x i32> with the two upper lanes taken from zeroinitializer.
; Expected code for each check prefix, as listed in the assertions below -
;   AVX1, AVX2       two calls to __extendhfsf2, then an unsigned conversion
;                    built from signed ones - vcvttps2dq, vpsrad $31 to form a
;                    mask, vsubps of a constant (shown as 2.14748365E+9 in the
;                    AVX2 broadcast), a second vcvttps2dq, vpand and vpor.
;   F16C             vcvtph2ps on each element, then the same
;                    vcvttps2dq/vsubps/vpand/vpor sequence.
;   AVX512F          vcvtph2ps, then vcvttps2udq on zmm0.
;   AVX512-FASTLANE  vcvtph2ps, then vcvttps2udq on xmm0.
; Every variant ends with a vmovq that zeroes the upper 64 bits of xmm0.
; NOTE(review): the RUN lines that map these prefixes to target features are
; outside this view - confirm there.
5044+ define <4 x i32 > @fptoui_2f16_to_4i32 (<2 x half > %a ) nounwind {
5045+ ; AVX1-LABEL: fptoui_2f16_to_4i32:
5046+ ; AVX1: # %bb.0:
5047+ ; AVX1-NEXT: subq $40, %rsp
5048+ ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
5049+ ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5050+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5051+ ; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5052+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5053+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5054+ ; AVX1-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
5055+ ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5056+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
5057+ ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
5058+ ; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5059+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
5060+ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
5061+ ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
5062+ ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5063+ ; AVX1-NEXT: addq $40, %rsp
5064+ ; AVX1-NEXT: retq
5065+ ;
5066+ ; AVX2-LABEL: fptoui_2f16_to_4i32:
5067+ ; AVX2: # %bb.0:
5068+ ; AVX2-NEXT: subq $40, %rsp
5069+ ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
5070+ ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5071+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5072+ ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5073+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5074+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5075+ ; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
5076+ ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5077+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
5078+ ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
5079+ ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
5080+ ; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
5081+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
5082+ ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
5083+ ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
5084+ ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5085+ ; AVX2-NEXT: addq $40, %rsp
5086+ ; AVX2-NEXT: retq
5087+ ;
5088+ ; F16C-LABEL: fptoui_2f16_to_4i32:
5089+ ; F16C: # %bb.0:
5090+ ; F16C-NEXT: vpsrld $16, %xmm0, %xmm1
5091+ ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1
5092+ ; F16C-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5093+ ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
5094+ ; F16C-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5095+ ; F16C-NEXT: vcvttps2dq %xmm0, %xmm1
5096+ ; F16C-NEXT: vpsrad $31, %xmm1, %xmm2
5097+ ; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5098+ ; F16C-NEXT: vcvttps2dq %xmm0, %xmm0
5099+ ; F16C-NEXT: vpand %xmm2, %xmm0, %xmm0
5100+ ; F16C-NEXT: vpor %xmm0, %xmm1, %xmm0
5101+ ; F16C-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5102+ ; F16C-NEXT: retq
5103+ ;
5104+ ; AVX512F-LABEL: fptoui_2f16_to_4i32:
5105+ ; AVX512F: # %bb.0:
5106+ ; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm1
5107+ ; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1
5108+ ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5109+ ; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0
5110+ ; AVX512F-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5111+ ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
5112+ ; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5113+ ; AVX512F-NEXT: vzeroupper
5114+ ; AVX512F-NEXT: retq
5115+ ;
5116+ ; AVX512-FASTLANE-LABEL: fptoui_2f16_to_4i32:
5117+ ; AVX512-FASTLANE: # %bb.0:
5118+ ; AVX512-FASTLANE-NEXT: vpsrld $16, %xmm0, %xmm1
5119+ ; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm1, %xmm1
5120+ ; AVX512-FASTLANE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
5121+ ; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %xmm0
5122+ ; AVX512-FASTLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5123+ ; AVX512-FASTLANE-NEXT: vcvttps2udq %xmm0, %xmm0
5124+ ; AVX512-FASTLANE-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
5125+ ; AVX512-FASTLANE-NEXT: retq
; IR under test - convert two lanes, then pad to four lanes. Mask indices 0-1
; select the converted values and indices 2-3 select zeroinitializer elements.
5126+ %cvt = fptoui <2 x half > %a to <2 x i32 >
5127+ %ext = shufflevector <2 x i32 > %cvt , <2 x i32 > zeroinitializer , <4 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 >
5128+ ret <4 x i32 > %ext
5129+ }
5130+
; fptoui_4f16_to_4i32 - unsigned conversion of a <4 x half> vector to <4 x i32>.
; Expected code for each check prefix, as listed in the assertions below -
;   AVX1, AVX2       four calls to __extendhfsf2; each pair of floats goes
;                    through vcvttps2dq, vpsrad $31, vsubps of a constant
;                    (shown as 2.14748365E+9 in the AVX2 broadcast), a second
;                    vcvttps2dq, vpand and vpor; the two pairs are joined with
;                    vpunpcklqdq.
;   F16C             one vcvtph2ps into ymm0, two vcvttps2dq around a vsubps,
;                    then vorps and vblendvps to select the result.
;   AVX512F          vcvtph2ps, then vcvttps2udq on zmm0.
;   AVX512-FASTLANE  vcvtph2ps, then vcvttps2udq on ymm0.
; NOTE(review): the RUN lines that map these prefixes to target features are
; outside this view - confirm there.
5131+ define <4 x i32 > @fptoui_4f16_to_4i32 (<4 x half > %a ) nounwind {
5132+ ; AVX1-LABEL: fptoui_4f16_to_4i32:
5133+ ; AVX1: # %bb.0:
5134+ ; AVX1-NEXT: subq $72, %rsp
5135+ ; AVX1-NEXT: vmovdqa %xmm0, %xmm1
5136+ ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5137+ ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
5138+ ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5139+ ; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
5140+ ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5141+ ; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm0
5142+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5143+ ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5144+ ; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
5145+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5146+ ; AVX1-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5147+ ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
5148+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
5149+ ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
5150+ ; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5151+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
5152+ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
5153+ ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
5154+ ; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5155+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5156+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5157+ ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5158+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5159+ ; AVX1-NEXT: callq __extendhfsf2@PLT
5160+ ; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5161+ ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5162+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1
5163+ ; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
5164+ ; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
5165+ ; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
5166+ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
5167+ ; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
5168+ ; AVX1-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
5169+ ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
5170+ ; AVX1-NEXT: addq $72, %rsp
5171+ ; AVX1-NEXT: retq
5172+ ;
5173+ ; AVX2-LABEL: fptoui_4f16_to_4i32:
5174+ ; AVX2: # %bb.0:
5175+ ; AVX2-NEXT: subq $72, %rsp
5176+ ; AVX2-NEXT: vmovdqa %xmm0, %xmm1
5177+ ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5178+ ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
5179+ ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5180+ ; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
5181+ ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
5182+ ; AVX2-NEXT: vpsrlq $48, %xmm1, %xmm0
5183+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5184+ ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5185+ ; AVX2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
5186+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5187+ ; AVX2-NEXT: vinsertps $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5188+ ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3]
5189+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
5190+ ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
5191+ ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
5192+ ; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
5193+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
5194+ ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
5195+ ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
5196+ ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5197+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5198+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5199+ ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5200+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5201+ ; AVX2-NEXT: callq __extendhfsf2@PLT
5202+ ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5203+ ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
5204+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1
5205+ ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
5206+ ; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
5207+ ; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0
5208+ ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
5209+ ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
5210+ ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
5211+ ; AVX2-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
5212+ ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
5213+ ; AVX2-NEXT: addq $72, %rsp
5214+ ; AVX2-NEXT: retq
5215+ ;
5216+ ; F16C-LABEL: fptoui_4f16_to_4i32:
5217+ ; F16C: # %bb.0:
5218+ ; F16C-NEXT: vcvtph2ps %xmm0, %ymm0
5219+ ; F16C-NEXT: vcvttps2dq %ymm0, %ymm1
5220+ ; F16C-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
5221+ ; F16C-NEXT: vcvttps2dq %ymm0, %ymm0
5222+ ; F16C-NEXT: vorps %ymm0, %ymm1, %ymm0
5223+ ; F16C-NEXT: vblendvps %ymm1, %ymm0, %ymm1, %ymm0
5224+ ; F16C-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5225+ ; F16C-NEXT: vzeroupper
5226+ ; F16C-NEXT: retq
5227+ ;
5228+ ; AVX512F-LABEL: fptoui_4f16_to_4i32:
5229+ ; AVX512F: # %bb.0:
5230+ ; AVX512F-NEXT: vcvtph2ps %xmm0, %ymm0
5231+ ; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
5232+ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
5233+ ; AVX512F-NEXT: vzeroupper
5234+ ; AVX512F-NEXT: retq
5235+ ;
5236+ ; AVX512-FASTLANE-LABEL: fptoui_4f16_to_4i32:
5237+ ; AVX512-FASTLANE: # %bb.0:
5238+ ; AVX512-FASTLANE-NEXT: vcvtph2ps %xmm0, %ymm0
5239+ ; AVX512-FASTLANE-NEXT: vcvttps2udq %ymm0, %ymm0
5240+ ; AVX512-FASTLANE-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
5241+ ; AVX512-FASTLANE-NEXT: vzeroupper
5242+ ; AVX512-FASTLANE-NEXT: retq
; IR under test - one full-width unsigned conversion, returned directly.
5243+ %cvt = fptoui <4 x half > %a to <4 x i32 >
5244+ ret <4 x i32 > %cvt
5245+ }
0 commit comments