@@ -10694,3 +10694,194 @@ define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) {
10694
10694
%result = udiv exact <2 x i64 > %num , <i64 4096 , i64 1024 >
10695
10695
ret <2 x i64 > %result
10696
10696
}
10697
+
10698
+ ; Unsigned 64-bit division by the constant 10 where the dividend is
+ ; (-1 - sext(i8 %size)). For any non-negative %size that value has its sign
+ ; bit set, i.e. read as unsigned it is above the signed i64 maximum.
+ ; Both the GFX6 and GFX9 expectations below are a full 64-bit expansion
+ ; (multiply-high / add-with-carry chains followed by compare/select fixups),
+ ; not the short float-based sequence.
+ ; NOTE(review): presumably this guards against narrowing the divide based on
+ ; sign bits of the dividend -- confirm against the change that added it.
+ ; NOTE(review): the CHECK lines look autogenerated (update_llc_test_checks
+ ; style); regenerate them rather than editing by hand -- confirm.
+ define i64 @udiv_i64_gt_smax (i8 %size ) {
10699
+ ; GFX6-LABEL: udiv_i64_gt_smax:
10700
+ ; GFX6: ; %bb.0:
10701
+ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10702
+ ; GFX6-NEXT: s_add_u32 s4, 0, 0x99986000
10703
+ ; GFX6-NEXT: v_mul_hi_u32 v2, s4, -10
10704
+ ; GFX6-NEXT: s_addc_u32 s5, 0, 0x59
10705
+ ; GFX6-NEXT: s_add_i32 s5, s5, 0x19999940
10706
+ ; GFX6-NEXT: s_mul_i32 s6, s4, -10
10707
+ ; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2
10708
+ ; GFX6-NEXT: s_mul_i32 s7, s5, -10
10709
+ ; GFX6-NEXT: v_mov_b32_e32 v3, s6
10710
+ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s7, v2
10711
+ ; GFX6-NEXT: v_mul_hi_u32 v4, s5, v3
10712
+ ; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2
10713
+ ; GFX6-NEXT: v_mul_hi_u32 v3, s4, v3
10714
+ ; GFX6-NEXT: v_mul_hi_u32 v6, s4, v2
10715
+ ; GFX6-NEXT: s_mul_i32 s6, s5, s6
10716
+ ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
10717
+ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5
10718
+ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
10719
+ ; GFX6-NEXT: v_mul_hi_u32 v6, s5, v2
10720
+ ; GFX6-NEXT: v_mul_lo_u32 v2, s5, v2
10721
+ ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s6, v3
10722
+ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc
10723
+ ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc
10724
+ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2
10725
+ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
10726
+ ; GFX6-NEXT: v_mov_b32_e32 v4, s5
10727
+ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2
10728
+ ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
10729
+ ; GFX6-NEXT: v_not_b32_e32 v0, v0
10730
+ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
10731
+ ; GFX6-NEXT: v_mul_lo_u32 v4, v0, v3
10732
+ ; GFX6-NEXT: v_mul_hi_u32 v5, v0, v2
10733
+ ; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3
10734
+ ; GFX6-NEXT: v_not_b32_e32 v1, v1
10735
+ ; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3
10736
+ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
10737
+ ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc
10738
+ ; GFX6-NEXT: v_mul_lo_u32 v6, v1, v2
10739
+ ; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2
10740
+ ; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3
10741
+ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6
10742
+ ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc
10743
+ ; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc
10744
+ ; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3
10745
+ ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
10746
+ ; GFX6-NEXT: v_mul_lo_u32 v4, v3, 10
10747
+ ; GFX6-NEXT: v_mul_hi_u32 v5, v2, 10
10748
+ ; GFX6-NEXT: v_mul_lo_u32 v6, v2, 10
10749
+ ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4
10750
+ ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
10751
+ ; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
10752
+ ; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 10, v0
10753
+ ; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v1, vcc
10754
+ ; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, 9, v4
10755
+ ; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
10756
+ ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
10757
+ ; GFX6-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
10758
+ ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 2, v2
10759
+ ; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v3, vcc
10760
+ ; GFX6-NEXT: v_add_i32_e32 v7, vcc, 1, v2
10761
+ ; GFX6-NEXT: v_cmp_lt_u32_e64 s[4:5], 9, v0
10762
+ ; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
10763
+ ; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
10764
+ ; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
10765
+ ; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
10766
+ ; GFX6-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5]
10767
+ ; GFX6-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
10768
+ ; GFX6-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
10769
+ ; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
10770
+ ; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v4, s[4:5]
10771
+ ; GFX6-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
10772
+ ; GFX6-NEXT: s_setpc_b64 s[30:31]
10773
+ ;
10774
+ ; GFX9-LABEL: udiv_i64_gt_smax:
10775
+ ; GFX9: ; %bb.0:
10776
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10777
+ ; GFX9-NEXT: s_add_u32 s4, 0, 0x99986000
10778
+ ; GFX9-NEXT: s_addc_u32 s5, 0, 0x59
10779
+ ; GFX9-NEXT: s_add_i32 s5, s5, 0x19999940
10780
+ ; GFX9-NEXT: s_mul_hi_u32 s9, s4, -10
10781
+ ; GFX9-NEXT: s_sub_i32 s9, s9, s4
10782
+ ; GFX9-NEXT: s_mul_i32 s10, s5, -10
10783
+ ; GFX9-NEXT: s_mul_i32 s6, s4, -10
10784
+ ; GFX9-NEXT: s_add_i32 s9, s9, s10
10785
+ ; GFX9-NEXT: s_mul_hi_u32 s7, s5, s6
10786
+ ; GFX9-NEXT: s_mul_i32 s8, s5, s6
10787
+ ; GFX9-NEXT: s_mul_i32 s11, s4, s9
10788
+ ; GFX9-NEXT: s_mul_hi_u32 s6, s4, s6
10789
+ ; GFX9-NEXT: s_mul_hi_u32 s10, s4, s9
10790
+ ; GFX9-NEXT: s_add_u32 s6, s6, s11
10791
+ ; GFX9-NEXT: s_addc_u32 s10, 0, s10
10792
+ ; GFX9-NEXT: s_add_u32 s6, s6, s8
10793
+ ; GFX9-NEXT: s_mul_hi_u32 s11, s5, s9
10794
+ ; GFX9-NEXT: s_addc_u32 s6, s10, s7
10795
+ ; GFX9-NEXT: s_addc_u32 s7, s11, 0
10796
+ ; GFX9-NEXT: s_mul_i32 s8, s5, s9
10797
+ ; GFX9-NEXT: s_add_u32 s6, s6, s8
10798
+ ; GFX9-NEXT: v_mov_b32_e32 v1, s6
10799
+ ; GFX9-NEXT: s_addc_u32 s7, 0, s7
10800
+ ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s4, v1
10801
+ ; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
10802
+ ; GFX9-NEXT: v_not_b32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
10803
+ ; GFX9-NEXT: s_addc_u32 s6, s5, s7
10804
+ ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, s6, 0
10805
+ ; GFX9-NEXT: v_mul_hi_u32 v6, v4, v5
10806
+ ; GFX9-NEXT: v_mov_b32_e32 v3, 31
10807
+ ; GFX9-NEXT: v_ashrrev_i32_sdwa v0, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
10808
+ ; GFX9-NEXT: v_not_b32_e32 v7, v0
10809
+ ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v1
10810
+ ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v5, 0
10811
+ ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc
10812
+ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, s6, 0
10813
+ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0
10814
+ ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v8, v1, vcc
10815
+ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
10816
+ ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v2
10817
+ ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
10818
+ ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, 10, 0
10819
+ ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, 10, v[1:2]
10820
+ ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v4, v0
10821
+ ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v7, v1, vcc
10822
+ ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, 10, v0
10823
+ ; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc
10824
+ ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, 9, v2
10825
+ ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
10826
+ ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
10827
+ ; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
10828
+ ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 2, v3
10829
+ ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v5, vcc
10830
+ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v3
10831
+ ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], 9, v0
10832
+ ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc
10833
+ ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
10834
+ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
10835
+ ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
10836
+ ; GFX9-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[4:5]
10837
+ ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v4, vcc
10838
+ ; GFX9-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
10839
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
10840
+ ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5]
10841
+ ; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5]
10842
+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
10843
+ ; Dividend = -1 - sext(%size); %size is in [-128, 127], so the subtraction
+ ; cannot wrap in either the signed or the unsigned sense (nuw nsw hold).
+ %esize = sext i8 %size to i64
10844
+ %minus = sub nuw nsw i64 -1 , %esize
10845
+ %div = udiv i64 %minus , 10
10846
+ ret i64 %div
10847
+ }
10848
+
10849
+ ; Unsigned 64-bit division by the constant 10 where the dividend is
+ ; (1 + zext(i8 %size)), i.e. a value in [1, 256] that fits in 9 bits.
+ ; Both expectations below use the short float-based sequence: convert to
+ ; f32, multiply by 0x3dcccccd (~0.1f), truncate, correct by one via a
+ ; fused multiply-add against 0x41200000 (10.0f), then mask the quotient to
+ ; 9 bits (0x1ff) and zero the high half of the i64 result (v1 = 0).
+ ; NOTE(review): the CHECK lines look autogenerated (update_llc_test_checks
+ ; style); regenerate them rather than editing by hand -- confirm.
+ define i64 @udiv_i64_9divbits (i8 %size ) {
10850
+ ; GFX6-LABEL: udiv_i64_9divbits:
10851
+ ; GFX6: ; %bb.0:
10852
+ ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10853
+ ; GFX6-NEXT: v_and_b32_e32 v0, 0xff, v0
10854
+ ; GFX6-NEXT: v_add_i32_e32 v0, vcc, 1, v0
10855
+ ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0
10856
+ ; GFX6-NEXT: s_mov_b32 s4, 0x41200000
10857
+ ; GFX6-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0
10858
+ ; GFX6-NEXT: v_trunc_f32_e32 v1, v1
10859
+ ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1
10860
+ ; GFX6-NEXT: v_mad_f32 v0, -v1, s4, v0
10861
+ ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
10862
+ ; GFX6-NEXT: v_mov_b32_e32 v1, 0
10863
+ ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
10864
+ ; GFX6-NEXT: v_and_b32_e32 v0, 0x1ff, v0
10865
+ ; GFX6-NEXT: s_setpc_b64 s[30:31]
10866
+ ;
10867
+ ; GFX9-LABEL: udiv_i64_9divbits:
10868
+ ; GFX9: ; %bb.0:
10869
+ ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10870
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 1
10871
+ ; GFX9-NEXT: v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
10872
+ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
10873
+ ; GFX9-NEXT: s_mov_b32 s4, 0x41200000
10874
+ ; GFX9-NEXT: v_mul_f32_e32 v1, 0x3dcccccd, v0
10875
+ ; GFX9-NEXT: v_trunc_f32_e32 v1, v1
10876
+ ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1
10877
+ ; GFX9-NEXT: v_mad_f32 v0, -v1, s4, v0
10878
+ ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
10879
+ ; GFX9-NEXT: v_mov_b32_e32 v1, 0
10880
+ ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
10881
+ ; GFX9-NEXT: v_and_b32_e32 v0, 0x1ff, v0
10882
+ ; GFX9-NEXT: s_setpc_b64 s[30:31]
10883
+ ; Dividend = 1 + zext(%size); zext gives [0, 255], so the sum is at most
+ ; 256 and the add cannot wrap (nuw nsw hold).
+ %zextend = zext i8 %size to i64
10884
+ %num = add nuw nsw i64 1 , %zextend
10885
+ %div = udiv i64 %num , 10
10886
+ ret i64 %div
10887
+ }
0 commit comments